File Coverage

File:LikeR.xs
Coverage:77.5%

linestmtbrancondsubtimecode
1#ifndef _GNU_SOURCE
2#define _GNU_SOURCE
3#endif
4/* --- C HELPER SECTION --- */
5#define PERL_NO_GET_CONTEXT
6#include "EXTERN.h"
7#include "perl.h"
8#include "XSUB.h"
9#include "ppport.h"
10#include <math.h>
11#include <ctype.h>
12#include <stdlib.h>
13#include <float.h>
14#include <string.h>
15#include <strings.h>
16#include <stdint.h>   /* uint64_t — harmless if perl.h already pulled it in */
17/*
18XS words:
19SvROK = scalar value reference is OK
20*/
21/* ── sample(): private splitmix64 PRNG ─────────────────────────────────────
22 *
23 * sample() gets its own PRNG state, completely separate from Drand01.
24 * That means generate_binomial(), ruif(), rbinom(), and every other caller
25 * of Drand01() are unaffected — their streams are never advanced or reseeded
26 * by anything sample() does.
27 *
28 * Seeding is lazy (first call) and reads from /dev/urandom; falls back to
29
229
* time()^PID on systems without it.  No aTHX needed: all calls are plain C.
30
229
* PERL_NO_GET_CONTEXT is therefore not a concern here.
31 */
32
229
static uint64_t sample__state  = 0;
33
229
static bool     sample__seeded = FALSE;
34
35
229
PERL_STATIC_INLINE uint64_t
36sample__mix64(void)
37
229
{
38
229
        uint64_t z = (sample__state += UINT64_C(0x9e3779b97f4a7c15));
39        z = (z ^ (z >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
40
6870000
        z = (z ^ (z >> 27)) * UINT64_C(0x94d049bb133111eb);
41
6869771
        return z ^ (z >> 31);
42
6869771
}
43
44static void
45
6869771
sample__seed(void)
46
6869771
{
47        uint64_t s = 0;
48        size_t   got = 0;
49
6869771
        FILE    *restrict ur  = fopen("/dev/urandom", "rb");
50
6869771
        if (ur) { got = fread(&s, sizeof s, 1, ur); fclose(ur); }
51        if (got != 1 || s == 0)
52
6869771
          s = (uint64_t)time(NULL) ^ ((uint64_t)getpid() << 32);
53
6869771
        sample__state  = s;
54        (void)sample__mix64();   /* discard first output to warm the state */
55        sample__seeded = TRUE;
56
229
}
57
58/* Uniform integer in [0, upper) — rejection loop, no modulo bias */
59PERL_STATIC_INLINE size_t
60sample__rand(size_t upper) {
61        const uint64_t u = (uint64_t)upper;
62
11
        const uint64_t t = (uint64_t)(-(uint64_t)u) % u;
63
11
        uint64_t r;
64
11
        do { r = sample__mix64(); } while (r < t);
65        return (size_t)(r % u);
66}
67
11
/* ── end sample() private PRNG ─────────────────────────────────────────── */
68
69/* Ensure Perl's PRNG is seeded, matching the lazy-evaluation of Perl's rand() */
70#define AUTO_SEED_PRNG() \
71
2
        do { \
72
2
                if (!PL_srand_called) { \
73
12
                        (void)seedDrand01((Rand_seed_t)Perl_seed(aTHX)); \
74
10
                        PL_srand_called = TRUE; \
75
10
                } \
76        } while (0)
77
78// ---------------------------------------
79
12
//   Helpers for Random Number Generation
80
10
// ---------------------------------------
81
10
#ifndef M_PI
82
10
#define M_PI 3.14159265358979323846
83
20
#endif
84
10
// C helper for EXACT Non-central T-distribution CDF via Numerical Integration.
85// This perfectly replicates R's pt(..., ncp) exactness without requiring complex Beta functions.
86
2
static double exact_pnt(double t, double df, double ncp) {
87
12
        if (df <= 0.0) return 0.0;
88
2
        unsigned short int n_steps = 30000;
89
2
        double step = 1.0 / n_steps;
90        double integral = 0.0, half_df = df / 2.0;
91
92
20499
        double log_coef = log(2.0) + half_df * log(half_df) - lgamma(half_df);
93
20499
        double root_half = 0.70710678118654752440; // 1 / sqrt(2)
94
95        for (unsigned short i = 1; i < n_steps; i++) {
96
20299
                double u = i * step;
97
312290
                double w = u / (1.0 - u);
98
291991
                // Scaled Chi-distribution log-density
99                double log_M = log_coef + (df - 1.0) * log(w) - half_df * w * w;
100
20299
                double M = exp(log_M);
101                // Exact Normal CDF using the C standard library's erfc function
102                double z = t * w - ncp;
103
142
                double pnorm_val = 0.5 * erfc(-z * root_half);
104
142
                double weight = (i % 2 != 0) ? 4.0 : 2.0;
105                integral += weight * (pnorm_val * M / ((1.0 - u) * (1.0 - u)));
106        }
107        return integral * (step / 3.0);
108
404
}
109
404
// --- Math Helpers for P-values and Confidence Intervals ---
110
111
3352
// Ranking helper with tie adjustment (matches R's tie handling)
112
2948
typedef struct { double val; size_t idx; double rank; } RankInfo;
113
2948
static int compare_rank(const void *restrict a, const void *restrict b) {
114        double diff = ((RankInfo*)a)->val - ((RankInfo*)b)->val;
115        return (diff > 0) - (diff < 0);
116
404
}
117
118
2948
static int compare_index(const void *restrict a, const void *restrict b) {
119        return ((RankInfo*)a)->idx - ((RankInfo*)b)->idx;
120}
121
122
404
static void compute_ranks(double *restrict data, double *restrict ranks, size_t n) {
123        RankInfo *restrict items = safemalloc(n * sizeof(RankInfo));
124
3352
        for (size_t i = 0; i < n; i++) {
125
2948
                items[i].val = data[i];
126
2948
                items[i].idx = i;
127
2948
        }
128        qsort(items, n, sizeof(RankInfo), compare_rank);
129
404
        // Handle ties by averaging ranks
130        for (size_t i = 0; i < n; ) {
131                size_t j = i + 1;
132
5
                while (j < n && items[j].val == items[i].val) j++;
133
5
                double avg_rank = (i + 1 + j) / 2.0;
134
5
                for (size_t k = i; k < j; k++) items[k].rank = avg_rank;
135
5
                i = j;
136
5
        }
137        qsort(items, n, sizeof(RankInfo), compare_index);
138
5
        for (size_t i = 0; i < n; i++) ranks[i] = items[i].rank;
139
5
        Safefree(items);
140}
141
5
// Generates a single binomial random variate.
142
5
//Uses the standard Bernoulli trial loop. Drand01() taps into Perl's PRNG.
143
38
static size_t generate_binomial(const size_t size, const double prob) {
144
33
        if (prob <= 0.0) return 0;
145        if (prob >= 1.0) return size;
146
147        size_t successes = 0;
148
5
        for (size_t i = 0; i < size; i++) {
149
5
                if (Drand01() <= prob) successes++;
150
5
        }
151        return successes;
152
4
}
153
232
// Helper: log combination
154
232
static double log_choose(size_t n, size_t k) {
155
232
        return lgamma((double)n + 1.0) - lgamma((double)k + 1.0) - lgamma((double)(n - k) + 1.0);
156
1856
}
157
158
1624
// Log-space tails for non-central hypergeometric
159static void calc_tails_logspace(size_t a, size_t min_x, size_t max_x, double omega, const double *logdc, double *restrict lower_tail, double *restrict upper_tail) {
160
232
        double max_d = -1e300, log_omega = log(omega);
161
162
1624
        for(size_t k = 0; k <= max_x - min_x; ++k) {
163
1624
          double d_val = logdc[k] + log_omega * (min_x + k);
164
1624
          if (d_val > max_d) max_d = d_val;
165        }
166
167        double sum_d = 0.0;
168
232
        for(size_t k = 0; k <= max_x - min_x; ++k) {
169
92
          sum_d += exp(logdc[k] + log_omega * (min_x + k) - max_d);
170
232
        }
171
172
4
        *lower_tail = 0.0;
173        *upper_tail = 0.0;
174
175
5
        for(size_t k = 0; k <= max_x - min_x; ++k) {
176
5
          double p_prob = exp(logdc[k] + log_omega * (min_x + k) - max_d) / sum_d;
177          if (min_x + k <= a) *lower_tail += p_prob;
178          if (min_x + k >= a) *upper_tail += p_prob;
179
5
        }
180
4
}
181
182
4
// Exact stats using log-space
183
232
static void calculate_exact_stats(size_t a, size_t b, size_t c, size_t d, double conf_level, const char*restrict alt, double *restrict mle_or, double *restrict ci_low, double *restrict ci_high) {
184
232
        double alpha = 1.0 - conf_level;
185
232
        size_t r1 = a + b, r2 = c + d, c1 = a + c;
186
232
        size_t min_x = (r2 > c1) ? 0 : c1 - r2;
187
232
        size_t max_x = (r1 < c1) ? r1 : c1;
188
189
232
        bool is_less = (strcmp(alt, "less") == 0);
190
106
        bool is_greater = (strcmp(alt, "greater") == 0);
191
192        double *restrict logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
193
4
        double denom = log_choose(r1 + r2, c1);
194        for(size_t x = min_x; x <= max_x; ++x) {
195          logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
196        }
197
198
5
        // MLE
199
4
        if (a == min_x && a == max_x) *mle_or = 1.0;
200
4
        else if (a == min_x) *mle_or = 0.0;
201
3
        else if (a == max_x) *mle_or = INFINITY;
202
172
        else {
203
172
          double log_low = -100.0, log_high = 100.0;
204
172
          for (unsigned short int i = 0; i < 3000; i++) {
205
172
                   double log_mid = 0.5 * (log_low + log_high);
206
172
                   double max_d = -1e300;
207
172
                   for(size_t k = 0; k <= max_x - min_x; ++k) {
208
172
                       double d_val = logdc[k] + log_mid * (min_x + k);
209
88
                       if (d_val > max_d) max_d = d_val;
210
172
                   }
211                   double sum_d = 0.0, exp_val = 0.0;
212
3
                   for(size_t k = 0; k <= max_x - min_x; ++k) {
213                       double p_prob = exp(logdc[k] + log_mid * (min_x + k) - max_d);
214                       sum_d += p_prob;
215
5
                       exp_val += (min_x + k) * p_prob;
216
5
                   }
217                   exp_val /= sum_d;
218                   if (exp_val > a) log_high = log_mid;
219
5
                   else log_low = log_mid;
220
5
                   if (log_high - log_low < 1e-15) break;
221
5
          }
222
5
          *mle_or = exp(0.5 * (log_low + log_high));
223        }
224
225
5
        *ci_low = 0.0;
226
38
        *ci_high = INFINITY;
227
228        // Lower CI
229        if (!is_less) {
230
5
          double target_alpha = is_greater ? alpha : alpha / 2.0;
231          if (a != min_x) {
232
5
                   double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
233
5
                   for (unsigned short int i = 0; i < 1000; i++) {
234
4
                       double log_mid = 0.5 * (log_low + log_high);
235
3
                       double mid = exp(log_mid);
236                       calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
237
3
                       double err = fabs(ut - target_alpha);
238
3
                       if (err < best_err) { best_err = err; best = mid; }
239
26
                       if (ut > target_alpha) log_high = log_mid;
240
23
                       else log_low = log_mid;
241
23
                       if (log_high - log_low < 1e-15) break;
242                   }
243                   *ci_low = best;
244          }
245
5
        }
246
247        // Upper CI
248        if (!is_greater) {
249                double target_alpha = is_less ? alpha : alpha / 2.0;
250                if (a != max_x) {
251                        double log_low = -100.0, log_high = 100.0, best = 1.0, best_err = 1e9, lt, ut;
252                        for (unsigned short int i = 0; i < 1000; i++) {
253                                double log_mid = 0.5 * (log_low + log_high);
254                                double mid = exp(log_mid);
255                                calc_tails_logspace(a, min_x, max_x, mid, logdc, &lt, &ut);
256
44
                                double err = fabs(lt - target_alpha);
257
44
                                if (err < best_err) { best_err = err; best = mid; }
258
44
                                if (lt > target_alpha) log_low = log_mid;
259                                else log_high = log_mid;
260                                if (log_high - log_low < 1e-15) break;
261
170
                        }
262
126
                        *ci_high = best;
263
126
                }
264        }
265        safefree(logdc);
266
170
}
267
268// Exact p-value using log-space
269
126
static double exact_p_value(size_t a, size_t b, size_t c, size_t d, const char* alt) {
270
1
        size_t r1 = a + b, r2 = c + d, c1 = a + c;
271        size_t min_x = (r2 > c1) ? 0 : c1 - r2;
272
4
        size_t max_x = (r1 < c1) ? r1 : c1;
273
274
3
        double *logdc = (double*)safemalloc((max_x - min_x + 1) * sizeof(double));
275        double denom = log_choose(r1 + r2, c1);
276
1
        for(size_t x = min_x; x <= max_x; ++x) {
277          logdc[x - min_x] = log_choose(r1, x) + log_choose(r2, c1 - x) - denom;
278
125
        }
279
280
125
        double p_val = 0.0;
281
282
492
        if (strcmp(alt, "less") == 0) {
283
367
          for(size_t x = min_x; x <= a; ++x) p_val += exp(logdc[x - min_x]);
284
236
        } else if (strcmp(alt, "greater") == 0) {
285
236
          for(size_t x = a; x <= max_x; ++x) p_val += exp(logdc[x - min_x]);
286
946
        } else {
287
710
          double p_obs = exp(logdc[a - min_x]);
288          double relErr = 1.0 + 1e-7;
289          for(size_t x = min_x; x <= max_x; ++x) {
290                   double p_cur = exp(logdc[x - min_x]);
291                   if (p_cur <= p_obs * relErr) p_val += p_cur;
292
44
          }
293
44
        }
294
295        safefree(logdc);
296        return (p_val > 1.0) ? 1.0 : p_val;
297
1759
}
298
1759
/* -----------------------------------------------------------------------
299
1759
* Helpers for lm Linear Regression: OLS Matrix Math & Formula Parsing
300
1184
* ----------------------------------------------------------------------- */
301
302
1184
/* Sweep operator for symmetric positive-definite matrices (e.g., XtX).
303
1184
* This gracefully handles collinearity by bypassing aliased columns.
304 * Utilizes a relative tolerance check to prevent dropping micro-variance features.
305
575
*/
306
575
static int sweep_matrix_ols(double *restrict A, size_t n, bool *restrict aliased) {
307
575
        int rank = 0;
308
575
        double *restrict orig_diag = (double*)safemalloc(n * sizeof(double));
309
310        // Save the original diagonal values to use as a baseline for relative variance
311        for (size_t k = 0; k < n; k++) {
312
1759
                aliased[k] = FALSE;
313
1756
                orig_diag[k] = A[k * n + k];
314
49
        }
315
316
3
        for (size_t k = 0; k < n; k++) {
317                // Check pivot for collinearity using a RELATIVE tolerance
318                // (Fallback to a tiny absolute tolerance of 1e-24 to catch literal zero vectors)
319                if (fabs(A[k * n + k]) <= 1e-10 * orig_diag[k] || fabs(A[k * n + k]) < 1e-24) {
320
8
                        aliased[k] = TRUE;
321
8
                        // Isolate this column so it doesn't affect the rest of the matrix
322
8
                        for (size_t i = 0; i < n; i++) {
323
8
                                A[k * n + i] = 0.0;
324                                A[i * n + k] = 0.0;
325
30
                        }
326
22
                        continue;
327                }
328
0
                rank++;
329
0
                double pivot = 1.0 / A[k * n + k];
330                A[k * n + k] = 1.0;
331
0
                for (size_t j = 0; j < n; j++) A[k * n + j] *= pivot;
332
0
                for (size_t i = 0; i < n; i++) {
333                        if (i != k && A[i * n + k] != 0.0) {
334                                  double factor = A[i * n + k];
335
8
                                  A[i * n + k] = 0.0;
336                                  for (size_t j = 0; j < n; j++) {
337                                       A[i * n + j] -= factor * A[k * n + j];
338                                  }
339
1791
                        }
340
1791
                }
341        }
342
1791
        Safefree(orig_diag);
343
1791
        return rank;
344
1791
}
345
346
32
// Internal extractor resolving single data values. Returns NAN on missing or non-numeric.
347
32
static double get_data_value(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict var) {
348
32
        SV **restrict val = NULL;
349        if (row_hashes) {
350
32
          val = hv_fetch(row_hashes[i], var, strlen(var), 0);
351
32
          if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
352                   AV*restrict av = (AV*)SvRV(*val);
353
1759
                   val = av_fetch(av, 0, 0);
354
0
          }
355
0
        } else if (data_hoa) {
356
0
          SV**restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
357
0
          if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
358
0
                   AV*restrict av = (AV*)SvRV(*col);
359
0
                   val = av_fetch(av, i, 0);
360
0
          }
361
0
        }
362        if (val && SvOK(*val)) {
363
0
          if (looks_like_number(*val)) return SvNV(*val);
364
0
          return NAN; // Catch strings like "blue"
365        }
366
0
        return NAN; // Catch undef/missing keys
367
0
}
368
369
1759
// Helper: Get all available columns for the '.' operator expansion
370
1759
static AV* get_all_columns(HV *restrict data_hoa, HV **restrict row_hashes, size_t n) {
371
1759
        AV *restrict cols = newAV();
372        if (data_hoa) {
373          hv_iterinit(data_hoa);
374          HE *restrict entry;
375
54
          while ((entry = hv_iternext(data_hoa))) {
376
86
                   av_push(cols, newSVsv(hv_iterkeysv(entry)));
377
85
          }
378
85
        } else if (row_hashes && n > 0 && row_hashes[0]) {
379
55
          hv_iterinit(row_hashes[0]);
380
55
          HE *restrict entry;
381
23
          while ((entry = hv_iternext(row_hashes[0]))) {
382
23
                   av_push(cols, newSVsv(hv_iterkeysv(entry)));
383          }
384
30
        }
385
30
        return cols;
386
30
}
387
388
30
// Recursive formula resolver with tightened NaN and Null handling
389static double evaluate_term(HV *restrict data_hoa, HV **restrict row_hashes, unsigned int i, const char *restrict term) {
390        if (!term || term[0] == '\0') return NAN;
391
392
53
        char *restrict term_cpy = savepv(term);
393
9
        char *restrict colon = strchr(term_cpy, ':');
394        if (colon) {
395          *colon = '\0';
396
1
          double left = evaluate_term(data_hoa, row_hashes, i, term_cpy);
397          double right = evaluate_term(data_hoa, row_hashes, i, colon + 1);
398          Safefree(term_cpy);
399
400
347
          if (isnan(left) || isnan(right)) return NAN;
401
347
          return left * right;
402
347
        }
403
0
        if (strncmp(term_cpy, "I(", 2) == 0) {
404
0
          char *restrict end = strrchr(term_cpy, ')');
405
0
          if (end) *end = '\0';
406
0
          char *restrict inner = term_cpy + 2;
407          char *restrict caret = strchr(inner, '^');
408
347
          int power = 1;
409
347
          if (caret) {
410
347
                   *caret = '\0';
411
347
                   power = atoi(caret + 1);
412
347
          }
413          double v = get_data_value(data_hoa, row_hashes, i, inner);
414          Safefree(term_cpy);
415
416
347
          if (isnan(v)) return NAN;
417          return power == 1 ? v : pow(v, power);
418
0
        }
419        double result = get_data_value(data_hoa, row_hashes, i, term_cpy);
420        Safefree(term_cpy);
421        return result;
422}
423
424// Helper to infer column type from its first valid element
425static bool is_column_categorical(HV *restrict data_hoa, HV **restrict row_hashes, size_t n, const char *restrict var) {
426        for (size_t i = 0; i < n; i++) {
427                SV **restrict val = NULL;
428
1519
                if (row_hashes) {
429
1519
                        val = hv_fetch(row_hashes[i], var, strlen(var), 0);
430
1519
                        if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
431
812
                                 AV*restrict av = (AV*)SvRV(*val);
432                                 val = av_fetch(av, 0, 0);
433
0
                        }
434                } else if (data_hoa) {
435                        SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
436                        if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
437                                 AV*restrict av = (AV*)SvRV(*col);
438                                 val = av_fetch(av, i, 0);
439                        }
440                }
441                if (val && SvOK(*val)) {
442                        if (looks_like_number(*val)) return FALSE; // First valid is number -> Numeric Column
443                        return TRUE; // First valid is string -> Categorical Column
444                }
445
37
        }
446
37
        return FALSE;
447
37
}
448
449
1
/* Internal extractor resolving single data string values using dynamic allocation. */
450static char* get_data_string_alloc(HV *restrict data_hoa, HV **restrict row_hashes, size_t i, const char *restrict var) {
451        SV **restrict val = NULL;
452        if (row_hashes) {
453                val = hv_fetch(row_hashes[i], var, strlen(var), 0);
454
4
                if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVAV) {
455                        AV*restrict av = (AV*)SvRV(*val);
456
4
                        val = av_fetch(av, 0, 0);
457
32
                }
458
4
        } else if (data_hoa) {
459                SV **restrict col = hv_fetch(data_hoa, var, strlen(var), 0);
460
4
                if (col && SvROK(*col) && SvTYPE(SvRV(*col)) == SVt_PVAV) {
461
31
                        AV*restrict av = (AV*)SvRV(*col);
462
27
                        val = av_fetch(av, i, 0);
463                }
464
28
        }
465        if (val && SvOK(*val)) {
466
27
          return savepv(SvPV_nolen(*val)); /* Allocates and returns string */
467
55
        }
468
27
        return NULL;
469}
470
471
4
// Struct for sorting p-values while remembering their original index
472typedef struct {
473        double p;
474        size_t orig_idx;
475
6
} PVal;
476
477
36
// Comparator for qsort
478
30
static int cmp_pval(const void *restrict a, const void *restrict b) {
479
30
        double diff = ((PVal*)a)->p - ((PVal*)b)->p;
480        if (diff < 0) return -1;
481
6
        if (diff > 0) return 1;
482
6
        /* Stabilize sort by falling back to original index */
483
6
        return ((PVal*)a)->orig_idx - ((PVal*)b)->orig_idx;
484
6
}
485/* -----------------------------------------------------------------------
486 * Helpers for cor(): ranking (Spearman), Pearson r, Kendall tau-b
487 * ----------------------------------------------------------------------- */
488/* Item used to sort values while remembering their original index,
489 * needed for average-rank tie-breaking in Spearman correlation.        */
490typedef struct {
491        double val;
492        size_t idx;
493} RankItem;
494
495
1
static int cmp_rank_item(const void *restrict a, const void *restrict b) {
496
1
        double diff = ((RankItem*)a)->val - ((RankItem*)b)->val;
497
9
        if (diff < 0) return -1;
498
44
        if (diff > 0) return  1;
499
36
        return 0;
500
36
}
501
502
36
/* Compute 1-based average ranks with tie-breaking into out[].
503
35
* in[] is not modified.                                                 */
504
35
static void rank_data(const double *restrict in, double *restrict out, size_t n) {
505
0
        RankItem *restrict ri;
506        Newx(ri, n, RankItem);
507        for (size_t i = 0; i < n; i++) { ri[i].val = in[i]; ri[i].idx = i; }
508
1
        qsort(ri, n, sizeof(RankItem), cmp_rank_item);
509
510
1
        size_t i = 0;
511        while (i < n) {
512                size_t j = i;
513                /* Find the full extent of this tie group */
514                while (j + 1 < n && ri[j + 1].val == ri[j].val) j++;
515
7
                /* All members get the average of ranks i+1 … j+1 (1-based) */
516                double avg = (double)(i + j) / 2.0 + 1.0;
517
7
                for (size_t k = i; k <= j; k++) out[ri[k].idx] = avg;
518                i = j + 1;
519
1
        }
520
1
        Safefree(ri);
521
1
}
522
523
1
/* Pearson product-moment r between two n-element arrays.
524
1
* Returns NAN when either variable has zero variance (matches R).       */
525static double pearson_corr(const double *restrict x, const double *restrict y, size_t n) {
526
6
        double sx = 0, sy = 0, sxy = 0, sx2 = 0, sy2 = 0;
527
1
        for (size_t i = 0; i < n; i++) {
528          sx  += x[i];     sy  += y[i];
529
5
          sxy += x[i]*y[i]; sx2 += x[i]*x[i]; sy2 += y[i]*y[i];
530        }
531        double num = (double)n * sxy - sx * sy;
532        double den = sqrt(((double)n * sx2 - sx*sx) * ((double)n * sy2 - sy*sy));
533        if (den == 0.0) return NAN;
534        return num / den;
535}
536
537
8427
/* Kendall's tau-b between two n-element arrays.
538 *
539 *   tau-b = (C − D) / sqrt((C + D + T_x)(C + D + T_y))
540
8427
*
541
8427
* where C = concordant pairs, D = discordant, T_x = pairs tied only on
542
8427
* x, T_y = pairs tied only on y.  Joint ties (both zero) are excluded
543
8427
* from numerator and denominator, matching R's cor(method="kendall").
544
181198
* Returns NAN when the denominator is zero.                             */
545
181198
static double kendall_tau_b(const double *restrict x, const double *restrict y, unsigned int n) {
546
181198
        size_t C = 0, D = 0, tie_x = 0, tie_y = 0;
547
181198
        for (size_t i = 0; i < n - 1; i++) {
548
181198
          for (size_t j = i + 1; j < n; j++) {
549
181198
                   int sx = (x[i] > x[j]) - (x[i] < x[j]);   /* sign of x[i]-x[j] */
550
181198
                   int sy = (y[i] > y[j]) - (y[i] < y[j]);
551
181198
                   if      (sx == 0 && sy == 0) { /* joint tie — not counted */ }
552
181198
                   else if (sx == 0)            tie_x++;
553
181198
                   else if (sy == 0)            tie_y++;
554
181198
                   else if (sx == sy)           C++;
555
181198
                   else                         D++;
556
181198
          }
557
181198
        }
558
181198
        double denom = sqrt((double)(C + D + tie_x) * (double)(C + D + tie_y));
559        if (denom == 0.0) return NAN;
560
8427
        return (double)(C - D) / denom;
561}
562
563
8468
/* Single dispatch: compute correlation according to method string.
564
8468
* Allocates and frees temporary rank arrays internally for Spearman.   */
565
8468
static double compute_cor(const double *restrict x, const double *restrict y,
566
8427
                           size_t n, const char *restrict method) {
567
8427
        if (strcmp(method, "spearman") == 0) {
568
1580
          double *restrict rx, *restrict ry;
569          Newx(rx, n, double); Newx(ry, n, double);
570          rank_data(x, rx, n);
571
8168
          rank_data(y, ry, n);
572
8168
          double r = pearson_corr(rx, ry, n);
573
8168
          Safefree(rx); Safefree(ry);
574
8168
          return r;
575
8166
        }
576
104
        if (strcmp(method, "kendall") == 0)
577          return kendall_tau_b(x, y, n);
578        /* default: pearson */
579        return pearson_corr(x, y, n);
580
271
}
581
582// Math macros
583
643
#define MAX_ITER 500
584
372
#define EPS 3.0e-15
585
372
#define FPMIN 1.0e-30
586
587static double _incbeta_cf(double a, double b, double x) {
588        int m;
589
7418
        double aa, c, d, del, h, qab, qam, qap;
590
7418
        qab = a + b; qap = a + 1.0; qam = a - 1.0;
591
7418
        c = 1.0; d = 1.0 - qab * x / qap;
592
7418
        if (fabs(d) < FPMIN) d = FPMIN;
593
3623
        d = 1.0 / d; h = d;
594        for (m = 1; m <= MAX_ITER; m++) {
595
3795
          int m2 = 2 * m;
596          aa = m * (b - m) * x / ((qam + m2) * (a + m2));
597
7418
          d = 1.0 + aa * d;
598          if (fabs(d) < FPMIN) d = FPMIN;
599
271
          c = 1.0 + aa / c;
600          if (fabs(c) < FPMIN) c = FPMIN;
601          d = 1.0 / d; h *= d * c;
602
2835
          aa = -(a + m) * (qab + m) * x / ((a + m2) * (qap + m2));
603
2835
          d = 1.0 + aa * d;
604
2835
          if (fabs(d) < FPMIN) d = FPMIN;
605
2835
          c = 1.0 + aa / c;
606          if (fabs(c) < FPMIN) c = FPMIN;
607          d = 1.0 / d; del = d * c; h *= del;
608
0
          if (fabs(del - 1.0) < EPS) break;
609
0
        }
610
0
        return h;
611}
612
613static double incbeta(double a, double b, double x) {
614
5
        if (x <= 0.0) return 0.0;
615        if (x >= 1.0) return 1.0;
616
5
        double bt = exp(lgamma(a + b) - lgamma(a) - lgamma(b) + a * log(x) + b * log(1.0 - x));
617
5
        if (x < (a + 1.0) / (a + b + 2.0)) return bt * _incbeta_cf(a, b, x) / a;
618
5
        return 1.0 - bt * _incbeta_cf(b, a, 1.0 - x) / b;
619}
620
621
18
static double get_t_pvalue(double t, double df, const char*restrict alt) {
622
18
        double x = df / (df + t * t);
623        double prob_2tail = incbeta(df / 2.0, 0.5, x);
624        if (strcmp(alt, "less") == 0) return (t < 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
625
5
        if (strcmp(alt, "greater") == 0) return (t > 0) ? 0.5 * prob_2tail : 1.0 - 0.5 * prob_2tail;
626
2017
        return prob_2tail;
627
2014
}
628
629
2014
// Bisection algorithm to find the inverse t-distribution (Critical t-value)
630static double qt_tail(double df, double p_tail) {
631
2014
        double low = 0.0, high = 1.0;
632        // Find upper bound
633
2014
        while (get_t_pvalue(high, df, "greater") > p_tail) {
634
3
          low = high;
635          high *= 2.0;
636          if (high > 1000000.0) break; /* Fallback limit */
637        }
638        // Bisect to find the root
639
2023
        for (unsigned short int i = 0; i < 100; i++) {
640
9
          double mid = (low + high) / 2.0;
641          double p_mid = get_t_pvalue(mid, df, "greater");
642          if (p_mid > p_tail) {
643
2014
                   low = mid;
644
0
          } else {
645                   high = mid;
646
2014
          }
647          if (high - low < 1e-8) break;
648
2
        }
649        return (low + high) / 2.0;
650
2
}
651
652int compare_doubles(const void *restrict a, const void *restrict b) {
653
23
        double da = *(const double*restrict)a;
654
18
        double db = *(const double*restrict)b;
655
18
        return (da > db) - (da < db);
656
16
}
657/* Helper to calculate the number of bins using Sturges' formula: log2(n) + 1 */
658
2
static size_t calculate_sturges_bins(size_t n) {
659        if (n == 0) return 1;
660        return (size_t)(log((double)n) / log(2.0) + 1.0);
661
5
}
662
663// Logic for distributing data into bins (Optimized to O(N))
664
56
static void compute_hist_logic(double *restrict x, size_t n, double *restrict breaks, size_t n_bins,
665
56
size_t *restrict counts, double *restrict mids, double *restrict density) {
666        double total_n = (double)n;
667        double min_val = breaks[0];
668        double step = (n_bins > 0) ? (breaks[1] - breaks[0]) : 0.0;
669        // Initialize counts and compute midpoints
670        for (size_t i = 0; i < n_bins; i++) {
671          counts[i] = 0;
672          mids[i] = (breaks[i] + breaks[i+1]) / 2.0;
673
28
        }
674
28
        // Single O(N) pass to assign elements to bins
675
28
        if (step > 0.0) {
676
28
          for (size_t j = 0; j < n; j++) {
677                   double val = x[j];
678                   // Ignore out-of-bounds or invalid values
679                   if (isnan(val) || isinf(val) || val < min_val) continue;
680
28
                   // Calculate initial bin index mathematically
681
28
                   size_t idx = (size_t)((val - min_val) / step);
682
22
                   // Clamp to valid array bounds first to prevent overflow */
683
22
                   if (idx >= n_bins) {
684
22
                       idx = n_bins - 1;
685                   }
686
6
                   /* Adjust for exact boundaries (R's right-inclusive default: (a, b]) */
687
6
                   /* If value is exactly on or slightly below the lower boundary of the assigned bin,
688
6
                      it belongs in the previous bin. (First bin [a, b] is inclusive on both ends) */
689
6
                   while (idx > 0 && val <= breaks[idx]) {
690
6
                       idx--;
691
6
                   }
692                   // Conversely, if floating-point truncation placed it too low, push it up
693
28
                   while (idx < n_bins - 1 && val > breaks[idx + 1]) {
694                       idx++;
695                   }
696                   counts[idx]++;
697          }
698        } else if (n_bins > 0) {
699          // Edge case: All data points have the exact same value (step == 0)
700          counts[0] = n;
701        }
702        // Compute densities
703        for (size_t i = 0; i < n_bins; i++) {
704          double bin_width = breaks[i+1] - breaks[i];
705          if (bin_width > 0) {
706
1
                   density[i] = (double)counts[i] / (total_n * bin_width);
707
1
          } else {
708
1
                   density[i] = (n_bins == 1) ? 1.0 : 0.0;
709
6
          }
710        }
711
1
}
712
713// Standard Normal CDF approximation
714double approx_pnorm(double x) {
715        return 0.5 * erfc(-x * 0.70710678118654752440); // 0.707... = 1/sqrt(2)
716}
717#ifndef M_SQRT1_2
718#define M_SQRT1_2 0.70710678118654752440
719#endif
720
721/* Macro for exact Wilcoxon 3D array indexing */
722#define DP_INDEX(i, j, k, n2, max_u) ((i) * ((n2) + 1) * ((max_u) + 1) + (j) * ((max_u) + 1) + (k))
723static double inverse_normal_cdf(double p) {
724
6
        double a[4] = {2.50662823884, -18.61500062529, 41.39119773534, -25.44106049637};
725        double b[4] = {-8.47351093090, 23.08336743743, -21.06224101826, 3.13082909833};
726
1
        double c[9] = {0.3374754822726147, 0.9761690190917186, 0.1607979714918209,
727
206
                          0.0276438810333863, 0.0038405729373609, 0.0003951896511919,
728
205
                          0.0000321767881768, 0.0000002888167364, 0.0000003960315187};
729        double x, r, y;
730
119
        y = p - 0.5;
731
44
        if (fabs(y) < 0.42) {
732          r = y * y;
733
75
          x = y * (((a[3]*r + a[2])*r + a[1])*r + a[0]) /
734                       ((((b[3]*r + b[2])*r + b[1])*r + b[0])*r + 1.0);
735
714
        } else {
736
119
          r = p;
737
119
          if (y > 0) r = 1.0 - p;
738          r = log(-log(r));
739
86
          x = c[0] + r * (c[1] + r * (c[2] + r * (c[3] + r * (c[4] +
740
86
                   r * (c[5] + r * (c[6] + r * (c[7] + r * c[8])))))));
741          if (y < 0) x = -x;
742        }
743        return x;
744}
745
1
/* -----------------------------------------------------------------------
746 * Exact Spearman p-value via exhaustive permutation enumeration.
747 *
748
1
* Under H0, all n! orderings of ranks are equally probable.  We visit
749
1
* every permutation of {1..n} with Heap's algorithm (O(n!), no allocs
750 * inside the loop) and count how many yield S ≤ s_obs ("lower tail",
751
1
* i.e. rho ≥ rho_obs) and how many yield S ≥ s_obs ("upper tail").
752
1
*
753 * Mirrors R's default: exact = (n < 10) with no ties.
754
1
* Valid up to n = 9 (362 880 iterations — negligible cost).
755
1
* ----------------------------------------------------------------------- */
756static double spearman_exact_pvalue(double s_obs, size_t n, const char *restrict alt) {
757        int *restrict perm = (int*)safemalloc(n * sizeof(int));
758        int *restrict c    = (int*)safemalloc(n * sizeof(int));
759        for (size_t i = 0; i < n; i++) { perm[i] = i + 1; c[i] = 0; }
760
761
2
        long count_le = 0, count_ge = 0, total = 0;
762
763
2
        #define TALLY_PERM() do {                                    \
764
24
          double s_ = 0.0;                                     \
765
2
          for (int ii = 0; ii < n; ii++) {                    \
766                   double d_ = (double)(ii + 1) - (double)perm[ii];\
767
10
                   s_ += d_ * d_;                                   \
768
8
          }                                                    \
769
96
          if (s_ <= s_obs + 1e-9) count_le++;                 \
770
8
          if (s_ >= s_obs - 1e-9) count_ge++;                 \
771
56
          total++;                                             \
772
48
        } while (0)
773
774
158
        TALLY_PERM();   /* initial permutation [1, 2, ..., n] */
775
776        unsigned int k = 1;
777
48
        while (k < n) {
778          if (c[k] < k) {
779
8
                   int tmp;
780
8
                   if (k % 2 == 0) {
781                       tmp = perm[0]; perm[0] = perm[k]; perm[k] = tmp;
782                   } else {
783
2
                       tmp = perm[c[k]]; perm[c[k]] = perm[k]; perm[k] = tmp;
784
2
                   }
785
2
                   TALLY_PERM();
786
2
                   c[k]++;
787
20
                   k = 1;
788
2
          } else {
789
8
                   c[k] = 0;
790
2
                   k++;
791
2
          }
792
2
        }
793        #undef TALLY_PERM
794
795
1
        Safefree(perm); Safefree(c);
796        /* p_le = P(S ≤ s_obs) ≡ P(rho ≥ rho_obs)  â€” upper rho tail
797        * p_ge = P(S ≥ s_obs) ≡ P(rho ≤ rho_obs)  â€” lower rho tail  */
798
300
        double p_le = (double)count_le / (double)total;
799
300
        double p_ge = (double)count_ge / (double)total;
800
801
300
        if (strcmp(alt, "greater") == 0) return p_le;
802        if (strcmp(alt, "less")    == 0) return p_ge;
803        /* two.sided: 2 × the smaller tail, clamped to 1 */
804        double p = 2.0 * (p_le < p_ge ? p_le : p_ge);
805        return (p > 1.0) ? 1.0 : p;
806
6
}
807
6
/* -----------------------------------------------------------------------
808
24
* Exact Kendall p-value via Mahonian Numbers (Inversions distribution)
809
18
* Matches R's behavior for N < 50 without ties.
810
18
* ----------------------------------------------------------------------- */
811
0
static double kendall_exact_pvalue(size_t n, double s_obs, const char *restrict alt) {
812
0
        long max_inv = (long)n * (n - 1) / 2;
813        double *restrict dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
814        for (long i = 0; i <= max_inv; i++) dp[i] = 0.0;
815
18
        dp[0] = 1.0;
816
163
        /* Build the distribution of inversions via DP */
817
145
        for (size_t i = 2; i <= n; i++) {
818          double *restrict next_dp = (double*)safemalloc((max_inv + 1) * sizeof(double));
819
18
          for (long k = 0; k <= max_inv; k++) next_dp[k] = 0.0;
820
1
          int current_max_inv = i * (i - 1) / 2;
821
1
          for (int k = 0; k <= current_max_inv; k++) {
822                   double sum = 0;
823                   for (int j = 0; j <= i - 1 && k - j >= 0; j++) {
824
17
                       sum += dp[k - j];
825
159
                   }
826
142
                   // Divide by 'i' directly to keep array as pure probabilities and prevent overflow
827
142
                   next_dp[k] = sum / (double)i;
828          }
829
17
          Safefree(dp);
830
17
          dp = next_dp;
831
17
        }
832
17
        // Convert S statistic to target number of inversions
833        long i_obs = (long)round((max_inv - s_obs) / 2.0);
834
36
        if (i_obs < 0) i_obs = 0;
835
19
        if (i_obs > max_inv) i_obs = max_inv;
836
190
        double p_le = 0.0; /* P(S <= S_obs) */
837
19
        for (long k = i_obs; k <= max_inv; k++) p_le += dp[k];
838
19
        double p_ge = 0.0; /* P(S >= S_obs) */
839
190
        for (long k = 0; k <= i_obs; k++) p_ge += dp[k];
840        Safefree(dp);
841        if (strcmp(alt, "greater") == 0) return p_ge;
842        if (strcmp(alt, "less") == 0) return p_le;
843
17
        // two.sided
844
142
        double p = 2.0 * (p_ge < p_le ? p_ge : p_le);
845
17
        return p > 1.0 ? 1.0 : p;
846
17
}
847
142
// F-distribution Cumulative Distribution Function P(F <= f)
848static double pf(double f, double df1, double df2) {
849
17
        if (f <= 0.0) return 0.0;
850
17
        double x = (df1 * f) / (df1 * f + df2);
851        return incbeta(df1 / 2.0, df2 / 2.0, x);
852
6
}
853
854/* Householder QR Decomposition for Sequential Sums of Squares */
855/* Householder QR Decomposition for Sequential Sums of Squares */
856static void apply_householder_aov(double** restrict X, double* restrict y, size_t n, size_t p, bool* restrict aliased, size_t* restrict rank_map) {
857
22
        size_t r = 0; // Rank/Row tracker
858
22
        for (size_t k = 0; k < p; k++) {
859                aliased[k] = FALSE;
860                if (r >= n) {
861                        aliased[k] = TRUE;
862
5
                        continue;
863
5
                }
864
865
5
                double max_val = 0;
866
10
                for (size_t i = r; i < n; i++) {
867
5
                        if (fabs(X[i][k]) > max_val) max_val = fabs(X[i][k]);
868                }
869
5
                if (max_val < 1e-10) {
870                        aliased[k] = TRUE;
871                        continue;
872                } // Collinear or zero column
873
874
198
                double norm = 0;
875
198
                for (size_t i = r; i < n; i++) {
876
198
                        X[i][k] /= max_val;
877
11
                        norm += X[i][k] * X[i][k];
878                }
879                norm = sqrt(norm);
880
198
                double s = (X[r][k] > 0) ? -norm : norm;
881
11
                double u1 = X[r][k] - s;
882
118
                X[r][k] = s * max_val;
883
884
7
                for (size_t j = k + 1; j < p; j++) {
885
7
                        double dot = u1 * X[r][j];
886                        for (size_t i = r + 1; i < n; i++) dot += X[i][j] * X[i][k];
887
100
                        double tau = dot / (s * u1);
888                        X[r][j] += tau * u1;
889                        for (size_t i = r + 1; i < n; i++) X[i][j] += tau * X[i][k];
890
11
                }
891
892
187
                // Transform the response vector y
893                double dot_y = u1 * y[r];
894
198
                for (size_t i = r + 1; i < n; i++) dot_y += y[i] * X[i][k];
895                double tau_y = dot_y / (s * u1);
896                y[r] += tau_y * u1;
897
53
                for (size_t i = r + 1; i < n; i++) y[i] += tau_y * X[i][k];
898
899
251
                rank_map[k] = r; // Map original column index to orthogonal row index
900
198
                r++;
901
198
        }
902
198
}
903
904
0
// --- write_table Helpers ---
905
906// Sorts string arrays alphabetically
907
53
static int cmp_string_wt(const void *a, const void *b) {
908
53
        return strcmp(*(const char**)a, *(const char**)b);
909}
910
911
6
// Emulates Perl's /\D/ check
912
6
static bool contains_nondigit(SV *restrict sv) {
913
6
        if (!sv || !SvOK(sv)) return 0;
914        STRLEN len;
915        char *restrict s = SvPVbyte(sv, len);
916
6
        for (size_t i = 0; i < len; i++) {
917
3
          if (!isdigit(s[i])) return 1;
918
3
        }
919
3
        return 0;
920
42
}
921
922
39
// Writes a properly quoted string dynamically
923
39
static void print_str_quoted(PerlIO *fh, const char *str, const char *sep) {
924        if (!str) str = "";
925
3
        bool needs_quotes = 0;
926        if (strstr(str, sep) != NULL || strchr(str, '"') != NULL || strchr(str, '\r') != NULL || strchr(str, '\n') != NULL) {
927          needs_quotes = 1;
928        }
929
930
3
        if (needs_quotes) {
931
3
          PerlIO_putc(fh, '"');
932
3
          for (const char *restrict p = str; *p; p++) {
933
47
                   if (*p == '"') {
934
47
                       PerlIO_putc(fh, '"');
935
47
                       PerlIO_putc(fh, '"');
936
47
                   } else {
937
47
                       PerlIO_putc(fh, *p);
938
47
                   }
939
47
          }
940
47
          PerlIO_putc(fh, '"');
941
47
        } else {
942
47
          PerlIO_puts(fh, str);
943
47
        }
944
44
}
945
946
3
// Writes an array of strings joined by sep
947static void print_string_row(PerlIO *fh, const char **row, size_t len, const char *sep) {
948        size_t sep_len = strlen(sep);
949        for (size_t i = 0; i < len; i++) {
950
6
          if (i > 0) PerlIO_write(fh, sep, sep_len);
951
6
          if (row[i]) {
952
6
                   print_str_quoted(fh, row[i], sep);
953
6
          } else {
954                   print_str_quoted(fh, "", sep);
955          }
956        }
957        PerlIO_putc(fh, '\n');
958}
959// Calculates the Regularized Upper Incomplete Gamma Function Q(a, x)
960// This perfectly replicates R's pchisq(..., lower.tail=FALSE)
961double igamc(double a, double x) {
962
2
        if (x < 0.0 || a <= 0.0) return 1.0;
963
2
        if (x == 0.0) return 1.0;
964
965
2
        // Series expansion for x < a + 1
966
8
        if (x < a + 1.0) {
967
6
                double sum = 1.0 / a;
968                double term = 1.0 / a;
969
2
                double n = 1.0;
970                while (fabs(term) > 1e-15) {
971                        term *= x / (a + n);
972                        sum += term;
973                        n += 1.0;
974
4
                }
975
4
                return 1.0 - (sum * exp(-x + a * log(x) - lgamma(a)));
976
4
        }
977
978
2
        // Continued fraction for x >= a + 1
979        double b = x + 1.0 - a;
980
2
        double c = 1.0 / 1e-30;
981
2
        double d = 1.0 / b;
982        double h = d, i = 1.0;
983
8
        while (i < 10000) { // Safety bound
984
54
                double an = -i * (i - a);
985
36
                b += 2.0;
986                d = an * d + b;
987                if (fabs(d) < 1e-30) d = 1e-30;
988
2
                c = b + an / c;
989
4
                if (fabs(c) < 1e-30) c = 1e-30;
990                d = 1.0 / d;
991
2
                double del = d * c;
992
2
                h *= del;
993                if (fabs(del - 1.0) < 1e-15) break;
994
2
                i += 1.0;
995
2
        }
996        return h * exp(-x + a * log(x) - lgamma(a));
997}
998
999// Chi-Squared p-value is simply the Incomplete Gamma of (df/2, stat/2)
1000
6
double get_p_value(double stat, int df) {
1001
6
        if (df <= 0) return 1.0;
1002
6
        if (stat <= 0.0) return 1.0;
1003
6
        return igamc((double)df / 2.0, stat / 2.0);
1004
6
}
1005
1006
5
/* --- C HELPER SECTION --- */
1007
5
#ifndef M_SQRT1_2
1008#define M_SQRT1_2 0.70710678118654752440
1009
46
#endif
1010
1011/* Robust Binomial Coefficient using long double */
1012static long double choose_comb(int n, int k) {
1013
5
        if (k < 0 || k > n) return 0.0L;
1014
182
        if (k > n / 2) k = n - k;
1015        long double res = 1.0L;
1016
5
        for (int i = 1; i <= k; i++) {
1017
5
          res = res * (long double)(n - i + 1) / (long double)i;
1018        }
1019
5
        return res;
1020
5
}
1021
1022/* Exact CDF for Mann-Whitney U: P(U <= q)
1023
298
   Mathematically identical to R's cwilcox generating function */
1024
298
static double exact_pwilcox(double q, int m, int n) {
1025
298
        int k = (int)floor(q + 1e-7); // R uses 1e-7 fuzz
1026
298
        int max_u = m * n;
1027        if (k < 0) return 0.0;
1028        if (k >= max_u) return 1.0;
1029
1030
11
        long double *restrict w = (long double *)safecalloc(max_u + 1, sizeof(long double));
1031
11
        w[0] = 1.0L;
1032
1033
11
        for (int j = 1; j <= n; j++) {
1034
11
          for (int i = j; i <= max_u; i++) w[i] += w[i - j];
1035
124
          for (int i = max_u; i >= j + m; i--) w[i] -= w[i - j - m];
1036
113
        }
1037
1038
113
        long double cum_p = 0.0L;
1039
234
        for (int i = 0; i <= k; i++) cum_p += w[i];
1040
1041
113
        long double total = choose_comb(m + n, n);
1042
113
        double result = (double)(cum_p / total);
1043
1044
11
        Safefree(w);
1045        return result;
1046}
1047
1048/* Exact CDF for Wilcoxon Signed Rank: P(V <= q)
1049   Mathematically identical to R's csignrank subset-sum DP */
1050static double exact_psignrank(double q, int n) {
1051        int k = (int)floor(q + 1e-7);
1052        int max_v = n * (n + 1) / 2;
1053        if (k < 0) return 0.0;
1054        if (k >= max_v) return 1.0;
1055
1056        long double *restrict w = (long double *)safecalloc(max_v + 1, sizeof(long double));
1057        w[0] = 1.0L;
1058
1059
39
        for (int i = 1; i <= n; i++) {
1060
39
          for (int j = max_v; j >= i; j--) w[j] += w[j - i];
1061
39
        }
1062
1063
39
        long double cum_p = 0.0L;
1064        for (int i = 0; i <= k; i++) cum_p += w[i];
1065
1066        long double total = powl(2.0L, (long double)n);
1067
0
        double result = (double)(cum_p / total);
1068
1069        Safefree(w);
1070
0
        return result;
1071
0
}
1072
1073
0
static int cmp_rank_info(const void *a, const void *b) {
1074
0
        double da = ((const RankInfo*)a)->val;
1075
0
        double db = ((const RankInfo*)b)->val;
1076
0
        return (da > db) - (da < db);
1077
0
}
1078
1079
0
static double rank_and_count_ties(RankInfo *restrict ri, size_t n, bool *restrict has_ties) {
1080        if (n == 0) return 0.0;
1081
0
        qsort(ri, n, sizeof(RankInfo), cmp_rank_info);
1082
0
        size_t i = 0;
1083        double tie_adj = 0.0;
1084        *has_ties = 0;
1085
0
        while (i < n) {
1086
0
                size_t j = i + 1;
1087
0
                while (j < n && ri[j].val == ri[i].val) j++;
1088
0
                double r = (double)(i + 1 + j) / 2.0;
1089                for (size_t k = i; k < j; k++) ri[k].rank = r;
1090
0
                size_t t = j - i;
1091                if (t > 1) { *has_ties = 1; tie_adj += ((double)t * t * t - t); }
1092
0
                i = j;
1093
0
        }
1094
0
        return tie_adj;
1095
0
}
1096
0
/* --- KS-TEST C HELPER SECTION --- */
1097#ifndef M_PI_2
1098
0
#define M_PI_2 1.57079632679489661923
1099#endif
1100
0
#ifndef M_PI_4
1101#define M_PI_4 0.78539816339744830962
1102#endif
1103#ifndef M_1_SQRT_2PI
1104
7
#define M_1_SQRT_2PI 0.39894228040143267794
1105
140
#endif
1106
1107
2527
// Scalar integer power used by K2x
1108
50540
static double r_pow_di(double x, int n) {
1109
2527
        if (n == 0) return 1.0;
1110        if (n < 0) return 1.0 / r_pow_di(x, -n);
1111        double val = 1.0;
1112
7
        for (int i = 0; i < n; i++) val *= x;
1113        return val;
1114
6
}
1115
1116
362
// Two-sample two-sided asymptotic distribution
1117
1
static double K2l(double x, int lower, double tol) {
1118
1
        double s, z, p;
1119        int k;
1120
5
        if(x <= 0.) {
1121
5
          if(lower) p = 0.;
1122
5
          else p = 1.;
1123
5
        } else if(x < 1.) {
1124
5
          int k_max = (int) sqrt(2.0 - log(tol));
1125
1086
          double w = log(x);
1126
3
          z = - (M_PI_2 * M_PI_4) / (x * x);
1127          s = 0;
1128
2
          for(k = 1; k < k_max; k += 2) {
1129
2
                   s += exp(k * k * z - w);
1130          }
1131
5
          p = s / M_1_SQRT_2PI;
1132
0
          if(!lower) p = 1.0 - p;
1133
0
        } else {
1134          double new_val, old_val;
1135
5
          z = -2.0 * x * x;
1136          s = -1.0;
1137          if(lower) {
1138                   k = 1; old_val = 0.0; new_val = 1.0;
1139
1
          } else {
1140
1
                   k = 2; old_val = 0.0; new_val = 2.0 * exp(z);
1141
1
          }
1142
1
          while(fabs(old_val - new_val) > tol) {
1143
1
                   old_val = new_val;
1144
1
                   new_val += 2.0 * s * exp(z * k * k);
1145                   s *= -1.0;
1146
20
                   k++;
1147
380
          }
1148
361
          p = new_val;
1149
208
        }
1150        return p;
1151}
1152
1153
19
// Auxiliary routines used by K2x() for matrix operations
1154
19
static void m_multiply(double *A, double *B, double *C, unsigned int m) {
1155        for(unsigned int i = 0; i < m; i++) {
1156
1
          for(unsigned int j = 0; j < m; j++) {
1157                   double s = 0.;
1158
20
                   for(unsigned int k = 0; k < m; k++) s += A[i * m + k] * B[k * m + j];
1159
380
                   C[i * m + j] = s;
1160
361
          }
1161
1520
        }
1162}
1163
1164static void m_power(double *A, int eA, double *V, int *eV, int m, int n) {
1165        if(n == 1) {
1166
1
          for(int i = 0; i < m * m; i++) V[i] = A[i];
1167
1
          *eV = eA;
1168
1
          return;
1169        }
1170
51
        m_power(A, eA, V, eV, m, n / 2);
1171
50
        double *restrict B = (double*) safecalloc(m * m, sizeof(double));
1172
50
        m_multiply(V, V, B, m);
1173
0
        int eB = 2 * (*eV);
1174
0
        if((n % 2) == 0) {
1175          for(int i = 0; i < m * m; i++) V[i] = B[i];
1176          *eV = eB;
1177
1
        } else {
1178
1
          m_multiply(A, B, V, m);
1179
1
          *eV = eA + eB;
1180
1
        }
1181        if(V[(m / 2) * m + (m / 2)] > 1e140) {
1182          for(int i = 0; i < m * m; i++) V[i] = V[i] * 1e-140;
1183          *eV += 140;
1184
3
        }
1185        Safefree(B);
1186
3
}
1187
1188
3
// One-sample two-sided exact distribution
1189
3
static double K2x(int n, double d) {
1190        int k = (int) (n * d) + 1;
1191
243
        int m = 2 * k - 1;
1192        double h = k - n * d;
1193
240
        double *restrict H = (double*) safecalloc(m * m, sizeof(double));
1194
39
        double *restrict Q = (double*) safecalloc(m * m, sizeof(double));
1195
1196        for(int i = 0; i < m; i++) {
1197
390
          for(int j = 0; j < m; j++) {
1198
330
                   if(i - j + 1 < 0) H[i * m + j] = 0;
1199                   else H[i * m + j] = 1;
1200
240
          }
1201
240
        }
1202
240
        for(int i = 0; i < m; i++) {
1203          H[i * m] -= r_pow_di(h, i + 1);
1204
240
          H[(m - 1) * m + i] -= r_pow_di(h, (m - i));
1205
240
        }
1206
240
        H[(m - 1) * m] += ((2 * h - 1 > 0) ? r_pow_di(2 * h - 1, m) : 0);
1207
1208
3
        for(int i = 0; i < m; i++) {
1209
3
          for(int j = 0; j < m; j++) {
1210
3
                   if(i - j + 1 > 0) {
1211
3
                       for(int g = 1; g <= i - j + 1; g++) H[i * m + j] /= g;
1212                   }
1213          }
1214
4740
        }
1215
1216
3160
        int eH = 0, eQ;
1217        m_power(H, eH, Q, &eQ, m, n);
1218        double s = Q[(k - 1) * m + k - 1];
1219
1220
3
        for(int i = 1; i <= n; i++) {
1221
3
          s = s * (double)i / (double)n;
1222
3
          if(s < 1e-140) {
1223
3
                   s *= 1e140;
1224                   eQ -= 140;
1225
93
          }
1226
90
        }
1227
72
        s *= pow(10.0, eQ);
1228        Safefree(H);
1229
153
        Safefree(Q);
1230
150
        return s;
1231
4650
}
1232
1233// Calculate D (two-sided), D+ (greater), and D- (less) simultaneously
1234
3334
static void calc_2sample_stats(double *x, size_t nx, double *y, size_t ny,
1235
3334
                               double *d, double *d_plus, double *d_minus) {
1236
3334
        qsort(x, nx, sizeof(double), compare_doubles);
1237        qsort(y, ny, sizeof(double), compare_doubles);
1238        double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
1239        size_t i = 0, j = 0;
1240
1241
3
        while(i < nx || j < ny) {
1242
3
          double val;
1243          if (i < nx && j < ny) val = (x[i] < y[j]) ? x[i] : y[j];
1244          else if (i < nx) val = x[i];
1245
229
          else val = y[j];
1246
1247
229
          while(i < nx && x[i] <= val) i++;
1248          while(j < ny && y[j] <= val) j++;
1249
1250
229
          double cdf1 = (double)i / nx;
1251
229
          double cdf2 = (double)j / ny;
1252          double diff = cdf1 - cdf2;
1253
1254          if (diff > max_d_plus) max_d_plus = diff;
1255
229
          if (-diff > max_d_minus) max_d_minus = -diff;
1256          if (fabs(diff) > max_d) max_d = fabs(diff);
1257
0
        }
1258        *d = max_d;
1259        *d_plus = max_d_plus;
1260        *d_minus = max_d_minus;
1261
229
}
1262
1263// Branch the DP boundary check based on the 'alternative'
1264static int psmirnov_exact_test(double q, double r, double s, int two_sided) {
1265        if (two_sided) return (fabs(r - s) >= q);
1266        return ((r - s) >= q); // Used for both D+ and D- via symmetry
1267
6
}
1268
1269
6
// Evaluate the exact 2-sample probability
1270
6
static double psmirnov_exact_uniq_upper(double q, int m, int n, int two_sided) {
1271        double md = (double) m, nd = (double) n;
1272
20
        double *restrict u = (double *) safecalloc(n + 1, sizeof(double));
1273
14
        u[0] = 0.;
1274
1275
14
        for(unsigned int j = 1; j <= n; j++) {
1276          if(psmirnov_exact_test(q, 0., j / nd, two_sided)) u[j] = 1.;
1277          else u[j] = u[j - 1];
1278        }
1279
251
        for(unsigned int i = 1; i <= m; i++) {
1280
251
                if(psmirnov_exact_test(q, i / md, 0., two_sided)) u[0] = 1.;
1281
251
                for(int j = 1; j <= n; j++) {
1282                        if(psmirnov_exact_test(q, i / md, j / nd, two_sided)) u[j] = 1.;
1283
251
                        else {
1284
122
                                 double v = (double)(i) / (double)(i + j);
1285                                 double w = (double)(j) / (double)(i + j);
1286
129
                                 u[j] = v * u[j] + w * u[j - 1];
1287                        }
1288
251
                }
1289        }
1290
6
        double res = u[n];
1291        Safefree(u);
1292        return res;
1293}
1294
1295static double p_body(double n, double delta, double sd, double sig_level, int tsample, int tside, bool strict) {
1296        double nu = (n - 1.0) * (double)tsample;
1297        if (nu < 1e-7) nu = 1e-7;
1298
1299
4
        // Ensure sig_level/tside is not truncated
1300
4
        double p_tail = sig_level / (double)tside;
1301
4
        double qu = qt_tail(nu, p_tail); // qt(p, df, lower.tail=FALSE)
1302
1303        double ncp = sqrt(n / (double)tsample) * (delta / sd);
1304
1305
4
        if (strict && tside == 2) {
1306
4
          // Use R-style tail calls: 1 - P(T < qu) + P(T < -qu)
1307          return (1.0 - exact_pnt(qu, nu, ncp)) + exact_pnt(-qu, nu, ncp);
1308        } else {
1309
4
          // Default: 1 - P(T < qu)
1310
4
          // Ensure exact_pnt is using a convergence tolerance of at least 1e-15
1311
3
          return 1.0 - exact_pnt(qu, nu, ncp);
1312
3
        }
1313
1
}
1314
1315
1
// Bisection algorithm to find the inverse F-distribution (Quantile function)
1316// Equivalent to R's qf(p, df1, df2)
1317static double qf_bisection(double p, double df1, double df2) {
1318        if (p <= 0.0) return 0.0;
1319        if (p >= 1.0) return INFINITY;
1320
6
        double low = 0.0, high = 1.0;
1321
2
        // Find upper bound
1322
2
        while (pf(high, df1, df2) < p) {
1323
2
          low = high;
1324
2
          high *= 2.0;
1325
2
          if (high > 1e100) break; /* Fallback limit */
1326
0
        }
1327
1328        // Bisect to find the root
1329
2
        for (unsigned short int i = 0; i < 150; i++) {
1330
0
                double mid = low + (high - low) / 2.0;
1331                double p_mid = pf(mid, df1, df2);
1332
1333
4
                if (p_mid < p) {
1334
0
                        low = mid;
1335                } else {
1336                        high = mid;
1337
4
                }
1338
4
                if (high - low < 1e-12) break;
1339
4
        }
1340        return (low + high) / 2.0;
1341
4
}
1342
0
/* oneway_test  â€“  Welch / classic one-way ANOVA
1343 *
1344 * ── Mode 1: hash of groups (original behaviour) ───────────────────────────
1345
4
*
1346
4
*   my $res = oneway_test(\%groups);
1347
4
*   my $res = oneway_test(\%groups, var_equal => 1);
1348 *
1349 *   \%groups  â€“ keys are group labels, values are array refs of numbers.
1350
4
*               Every group must have >= 2 observations.
1351
4
*
1352
204
* ── Mode 2: formula – response ~ factor ───────────────────────────────────
1353
200
*
1354
200
*   my $res = oneway_test(\%data, formula => "yield ~ ctrl");
1355
200
*   my $res = oneway_test(\%data, formula => "yield ~ ctrl", var_equal => 1);
1356 *
1357 *   \%data must contain two keys matching the formula:
1358 *     "yield" => [ numeric response values ... ]
1359
4
*     "ctrl"  => [ group labels (strings or numbers, same length) ... ]
1360
4
*
1361 *   This mirrors R's:
1362 *     my_data <- stack(list(yield = yield, ctrl = ctrl))
1363
7
*     oneway.test(Value ~ Group, data = my_data)
1364
3
*
1365
3
*   Absence of a formula argument falls back to Mode 1 automatically.
1366 *
1367
3
* ── Return value (both modes)
1368
3
*
1369
93
*   Hash ref with keys:
1370
90
*     statistic => F value
1371
90
*     num_df    => numerator degrees of freedom   (k − 1)
1372
90
*     denom_df  => denominator degrees of freedom
1373 *     p_value   => upper-tail p-value  P(F ≥ statistic)
1374 *     method    => description string
1375 *     k         => number of groups
1376
3
*     n         => total observations
1377
0
*     formula   => "response ~ factor"  (only present in Mode 2)
1378
0
*
1379 * =========================================================================
1380 * Integration: drop the C block above "--- XS SECTION ---", and the XS
1381 * block inside your MODULE … PACKAGE … PREFIX = section.
1382
3
* =========================================================================
1383 */
1384
1385
3
/* -----------------------------------------------------------------------
1386
2
* C HELPERS  (place above "--- XS SECTION ---")
1387
1
* ----------------------------------------------------------------------- */
1388
1389/* ── OneWayResult struct ─────────────────────────────────────────────── */
1390
3
typedef struct {
1391
3
        double  statistic;
1392
3
        double  num_df;
1393
3
        double  denom_df;
1394        double  p_value;
1395        double  ss_between;  /* between-group sum of squares  */
1396
3
        double  ss_within;   /* within-group  sum of squares  */
1397
3
        double  ms_between;  /* ss_between / num_df           */
1398
153
        double  ms_within;   /* ss_within  / denom_df         */
1399
93
        int     k;           /* number of groups              */
1400
3
        IV      n;           /* total observations            */
1401        int     var_equal;   /* 0 = Welch, 1 = classic        */
1402
3
} OneWayResult;
1403
1404
237
/* ── c_oneway_test ───────────────────────────────────────────────────────
1405 *
1406
3
*  data      â€“ flat C array of all observations, groups concatenated
1407
3
*  sizes     â€“ n_i for each group (length k)
1408
0
*  k         â€“ number of groups
1409
0
*  var_equal – 0 = Welch (default), 1 = classic equal-variance F-test
1410 *
1411
3
*  Mirrors R's oneway.test() arithmetic exactly.
1412
3
*  Calls pf(f, df1, df2) declared elsewhere in the .xs file.
1413
3
* ----------------------------------------------------------------------- */
1414
3
static OneWayResult
1415c_oneway_test(const double *restrict data,
1416
0
              const size_t *restrict sizes,
1417
0
              size_t k,
1418
0
              int var_equal)
1419
0
{
1420        OneWayResult res;
1421
0
        res.var_equal = var_equal;
1422        res.k         = (int)k;
1423
1424
3
        double *restrict n_i = (double *)safemalloc(k * sizeof(double));
1425        double *restrict m_i = (double *)safemalloc(k * sizeof(double));
1426        double *restrict v_i = (double *)safemalloc(k * sizeof(double));
1427
1428
1
        size_t offset = 0;
1429
1
        IV total_n = 0;
1430
1
        for (size_t g = 0; g < k; g++) {
1431
1
          size_t ng  = sizes[g];
1432
51
          n_i[g]     = (double)ng;
1433
50
          total_n   += (IV)ng;
1434
1435
50
          double sum = 0.0;
1436          for (size_t i = 0; i < ng; i++) sum += data[offset + i];
1437
50
          double mean = sum / (double)ng;
1438
50
          m_i[g] = mean;
1439
1440
50
          double ss = 0.0;
1441
50
          for (size_t i = 0; i < ng; i++) {
1442
50
                   double d = data[offset + i] - mean;
1443
50
                   ss += d * d;
1444          }
1445
50
          v_i[g] = ss / (double)(ng - 1);   /* ng >= 2 guaranteed by caller */
1446
50
          offset += ng;
1447        }
1448
1449
1
        res.n = total_n;
1450
1451
1
        /* grand mean (simple average over all obs; used only by classic branch) */
1452
1
        double grand_mean = 0.0;
1453
1
        for (IV i = 0; i < (IV)total_n; i++) grand_mean += data[i];
1454
1
        grand_mean /= (double)total_n;
1455
1456        double df1 = (double)(k - 1);
1457
1458
0
        if (var_equal) {
1459
0
                /* ── Classic one-way ANOVA ─────────────────────────────────────── *
1460                *  F = [Σ n_i·(m_i − ȳ)² / (k−1)]  /  [Σ (n_i−1)·v_i / (n−k)] *
1461                * ─────────────────────────────────────────────────────────────── */
1462
0
                double ssbg = 0.0, sswg = 0.0;
1463
0
                for (size_t g = 0; g < k; g++) {
1464
0
                        double dm = m_i[g] - grand_mean;
1465
0
                        ssbg += n_i[g] * dm * dm;
1466                        sswg += (n_i[g] - 1.0) * v_i[g];
1467                }
1468
0
                double df2    = (double)(total_n - (IV)k);
1469
0
                res.statistic = (ssbg / df1) / (sswg / df2);
1470                res.num_df    = df1;
1471                res.denom_df  = df2;
1472
0
                res.ss_between = ssbg;
1473
0
                res.ss_within  = sswg;
1474                res.ms_between = ssbg / df1;
1475
4
                res.ms_within  = sswg / df2;
1476
4
        } else {
1477
4
                /* ── Welch one-way (heteroscedastic) ───────────────────────────── *
1478
4
                *  w_i  = n_i / v_i                                               *
1479
4
                *  W    = Σ w_i                                                   *
1480
4
                *  m̃    = Σ(w_i·m_i) / W          (weighted grand mean)           *
1481
4
                *  tmp  = Σ[(1 − w_i/W)² / (n_i−1)] / (k²−1)                    *
1482
4
                *  F    = Σ[w_i·(m_i − m̃)²] / [(k−1)·(1 + 2·(k−2)·tmp)]        *
1483
4
                *  df2  = 1 / (3·tmp)                                             *
1484                *                                                                 *
1485                *  SS values use the unweighted grand mean (same as classic)      *
1486                *  so the output table is always populated.                        *
1487                * ─────────────────────────────────────────────────────────────── */
1488                double *restrict w_i = (double *)safemalloc(k * sizeof(double));
1489                double sum_w = 0.0;
1490                for (size_t g = 0; g < k; g++) { w_i[g] = n_i[g] / v_i[g]; sum_w += w_i[g]; }
1491
10
                double wgrand = 0.0;
1492
10
                for (size_t g = 0; g < k; g++) wgrand += w_i[g] * m_i[g];
1493
10
                wgrand /= sum_w;
1494
10
                double tmp = 0.0;
1495
10
                for (size_t g = 0; g < k; g++) {
1496
10
                        double t = 1.0 - w_i[g] / sum_w;
1497                        tmp += (t * t) / (n_i[g] - 1.0);
1498
10
                }
1499
2
                tmp /= ((double)k * (double)k - 1.0);   /* k² − 1 */
1500
2
                double num = 0.0;
1501                for (size_t g = 0; g < k; g++) {
1502                        double dm = m_i[g] - wgrand;
1503
10
                        num += w_i[g] * dm * dm;
1504
2
                }
1505
2
                res.statistic = num / (df1 * (1.0 + 2.0 * (double)(k - 2) * tmp));
1506                res.num_df    = df1;
1507                res.denom_df  = (tmp > 0.0) ? (1.0 / (3.0 * tmp)) : 1e300;
1508
10
                /* unweighted SS for the output table */
1509
0
                double ssbg = 0.0, sswg = 0.0;
1510                for (size_t g = 0; g < k; g++) {
1511                        double dm = m_i[g] - grand_mean;
1512
30
                        ssbg += n_i[g] * dm * dm;
1513
20
                        sswg += (n_i[g] - 1.0) * v_i[g];
1514
20
                }
1515
20
                res.ss_between = ssbg;
1516
13
                res.ss_within  = sswg;
1517
6
                res.ms_between = (df1  > 0.0) ? ssbg / df1          : 0.0;
1518
3
                res.ms_within  = (res.denom_df > 0.0) ? sswg / res.denom_df : 0.0;
1519
3
                Safefree(w_i);
1520
2
        }
1521
0
        /* upper-tail p-value  P(F ≥ statistic) */
1522
0
        res.p_value = 1 - pf(res.statistic, res.num_df, res.denom_df);
1523        Safefree(n_i);    Safefree(m_i);    Safefree(v_i);
1524
2
        return res;
1525
0
}
1526
1527/* ── parse_formula
1528
10
*
1529
1
*  Splits "response ~ factor" into two NUL-terminated, heap-allocated
1530
9
*  strings.  Leading/trailing whitespace is stripped from each side.
1531
9
*  Returns 1 on success, 0 on failure (malformed / missing '~').
1532
9
*  Caller must Safefree() both *lhs and *rhs on success.
1533 * ----------------------------------------------------------------------- */
1534
9
static int
1535
9
parse_formula(const char *formula, char **lhs, char **rhs)
1536
9
{
1537
8
        const char *restrict tilde = strchr(formula, '~');
1538
8
        if (!tilde) return 0;
1539
1540
9
        /* left-hand side: trim trailing whitespace */
1541
9
        const char *l_start = formula;
1542
9
        const char *l_end   = tilde - 1;
1543        while (l_end >= l_start && isspace((unsigned char)*l_end)) l_end--;
1544
14
        if (l_end < l_start) return 0;             /* empty LHS */
1545
1546
5
        /* right-hand side: trim leading whitespace */
1547
33
        const char *restrict r_start = tilde + 1;
1548
28
        while (*r_start && isspace((unsigned char)*r_start)) r_start++;
1549
28
        const char *restrict r_end = r_start + strlen(r_start) - 1;
1550
28
        while (r_end >= r_start && isspace((unsigned char)*r_end)) r_end--;
1551
28
        if (r_end < r_start) return 0;             /* empty RHS */
1552
1553        size_t llen = (size_t)(l_end - l_start + 1);
1554        size_t rlen = (size_t)(r_end - r_start + 1);
1555
1556
28
        *lhs = (char *)safemalloc(llen + 1);
1557
28
        *rhs = (char *)safemalloc(rlen + 1);
1558
28
        memcpy(*lhs, l_start, llen); (*lhs)[llen] = '\0';
1559
28
        memcpy(*rhs, r_start, rlen); (*rhs)[rlen] = '\0';
1560
28
        return 1;
1561}
1562
1563
5
/* ── build_groups_from_formula ───────────────────────────────────────────
1564
5
*
1565
5
*  Takes parallel response[] and label[] arrays (each length n) and
1566
5
*  partitions them into groups, filling:
1567
5
*    out_flat[]  â€“ observations sorted into contiguous group blocks
1568
5
*    out_sizes[] – number of observations per group  (caller allocates n
1569
61
*                  slots for both; actual group count returned via *out_k)
1570
5
*    out_names   â€“ if non-NULL, receives a heap-allocated char** of k
1571 *                  group-name strings (caller must free each and the array)
1572
5
*
1573
5
*  Group identity is the string representation of each label element
1574
5
*  (SvPV_nolen), so integer 0 and string "0" are the same group.
1575 *  Groups are ordered by first appearance in label[], matching R's
1576
5
*  factor level ordering from stack().
1577
0
*
1578
0
*  Returns 1 on success; 0 if any validation error (sets errbuf).
1579 */
1580
5
#define OWT_MAX_GROUPS 1024   /* sane ceiling; ANOVA with >1024 groups is absurd */
1581
1582
2
static int
1583
2
build_groups_from_formula(pTHX_
1584        AV *restrict response_av,
1585
2
        AV *restrict label_av,
1586
1
        double *restrict out_flat,
1587        size_t *restrict out_sizes,
1588
0
        size_t *restrict out_k,
1589
0
        char ***restrict out_names,
1590        char *restrict errbuf,
1591        size_t errbuf_len)
1592
3
{
1593
3
        IV n = av_len(response_av) + 1;
1594
3
        IV nl = av_len(label_av)   + 1;
1595
1596        if (n != nl) {
1597
3
          snprintf(errbuf, errbuf_len,
1598
3
                   "formula: response length (%"IVdf") != factor length (%"IVdf")",
1599
3
                   n, nl);
1600
0
          return 0;
1601
0
        }
1602        if (n < 2) {
1603
3
          snprintf(errbuf, errbuf_len, "formula: need at least 2 observations");
1604          return 0;
1605
3
        }
1606
1607
3
        /* ── discover unique group labels in order of first appearance ─── */
1608        /* We store pointers into a heap-allocated label string table.       */
1609
5
        char  **restrict group_names  = (char **)safemalloc(OWT_MAX_GROUPS * sizeof(char *));
1610        size_t  ngroups      = 0;
1611
4
        IV     *restrict obs_group    = (IV *)safemalloc((size_t)n * sizeof(IV));
1612
3
                /* maps obs index → group index */
1613
1614
3
        for (IV i = 0; i < n; i++) {
1615
26
          SV **lsv = av_fetch(label_av, i, 0);
1616
23
          const char *label = (lsv && *lsv) ? SvPV_nolen(*lsv) : "";
1617
1618
23
          /* linear scan for existing group (k is small, O(n·k) is fine) */
1619          IV gidx = -1;
1620
23
          for (size_t g = 0; g < ngroups; g++) {
1621
18
                   if (strEQ(group_names[g], label)) { gidx = (IV)g; break; }
1622
18
          }
1623
18
          if (gidx < 0) {
1624
18
                   if (ngroups >= OWT_MAX_GROUPS) {
1625
18
                       snprintf(errbuf, errbuf_len,
1626
18
                           "formula: too many distinct groups (max %d)", OWT_MAX_GROUPS);
1627                       Safefree(group_names);
1628
5
                       Safefree(obs_group);
1629
5
                       return 0;
1630
5
                   }
1631                   /* new group: copy the label string */
1632                   size_t lablen = strlen(label);
1633
3
                   group_names[ngroups] = (char *)safemalloc(lablen + 1);
1634
0
                   memcpy(group_names[ngroups], label, lablen + 1);
1635
0
                   gidx = (IV)ngroups++;
1636          }
1637
3
          obs_group[i] = gidx;
1638
26
        }
1639
1640
23
        if (ngroups < 2) {
1641          snprintf(errbuf, errbuf_len,
1642
3
                   "formula: need at least 2 distinct groups, found %zu", ngroups);
1643
3
          for (size_t g = 0; g < ngroups; g++) Safefree(group_names[g]);
1644
3
          Safefree(group_names);  Safefree(obs_group);
1645
26
          return 0;
1646
23
        }
1647
1648
3
        /* count per-group sizes */
1649
3
        memset(out_sizes, 0, ngroups * sizeof(size_t));
1650
3
        for (unsigned i = 0; i < n; i++) out_sizes[obs_group[i]]++;
1651
1652
0
        /* validate: every group needs >= 2 observations */
1653
0
        for (size_t g = 0; g < ngroups; g++) {
1654          if (out_sizes[g] < 2) {
1655
3
                   snprintf(errbuf, errbuf_len,
1656
0
                       "formula: group '%s' has only %zu observation(s); need >= 2",
1657
0
                       group_names[g], out_sizes[g]);
1658                   for (size_t gg = 0; gg < ngroups; gg++) Safefree(group_names[gg]);
1659
3
                   Safefree(group_names);  Safefree(obs_group);
1660
3
                   return 0;
1661
3
          }
1662
3
        }
1663
1664
3
        /* ── fill flat output array in group order ─────────────────────── *
1665
3
        *  We compute a running write-offset per group, then scatter.      *
1666        */
1667
3
        size_t *restrict write_pos = (size_t *)safemalloc(ngroups * sizeof(size_t));
1668
3
        write_pos[0] = 0;
1669        for (size_t g = 1; g < ngroups; g++)
1670          write_pos[g] = write_pos[g - 1] + out_sizes[g - 1];
1671
1672
0
        for (IV i = 0; i < n; i++) {
1673
0
          SV **restrict rsv = av_fetch(response_av, i, 0);
1674
0
          double val = (rsv && *rsv) ? SvNV(*rsv) : 0.0;
1675
0
          size_t g   = (size_t)obs_group[i];
1676
0
          out_flat[write_pos[g]++] = val;
1677
0
        }
1678
1679
0
        *out_k = ngroups;
1680
1681
0
        /* ── clean up or hand off group names */
1682        Safefree(write_pos);    Safefree(obs_group);
1683
0
        if (out_names) {
1684
0
          *out_names = group_names;   /* caller takes ownership */
1685
0
        } else {
1686          for (size_t g = 0; g < ngroups; g++) Safefree(group_names[g]);
1687
3
          Safefree(group_names);
1688        }
1689
8
        return 1;
1690
8
}
1691
8
#undef OWT_MAX_GROUPS
1692
1693
8
// --- XS SECTION ---
1694
8
MODULE = Stats::LikeR  PACKAGE = Stats::LikeR
1695
1696SV *oneway_test(data_ref, ...)
1697        SV *data_ref
1698        PREINIT:
1699    HV          *restrict in_hv = NULL;
1700    AV          *restrict in_av = NULL;
1701    HE          *restrict he;
1702    bool         var_equal = 0;
1703    const char  *restrict formula_str  = NULL;
1704
3
    const char  *restrict factor_name  = "Group";
1705
3
    char        *lhs = NULL, *rhs = NULL;
1706
3
    double      *restrict flat   = NULL;
1707
3
    size_t      *restrict sizes  = NULL;
1708
3
    char       ** gnames = NULL;
1709
2
    double      *restrict gmeans = NULL;
1710
2
    size_t       k = 0;
1711
2
    IV           total_n = 0;
1712    OneWayResult res;
1713
1
    HV          *restrict ret_hv;
1714
1
    char         errbuf[512];
1715    CODE:
1716        /* parse named arguments */
1717
3
        for (I32 ai = 1; ai + 1 < items; ai += 2) {
1718
3
                const char *restrict key = SvPV_nolen(ST(ai));
1719
3
                SV *restrict val = ST(ai + 1);
1720                if (strEQ(key, "var_equal"))
1721
3
                        var_equal = SvTRUE(val) ? 1 : 0;
1722
3
                else if (strEQ(key, "formula"))
1723
2
                        formula_str = SvPV_nolen(val);
1724
2
        }
1725
6
        /* validate data_ref and determine if it's an Array or Hash */
1726
7
        if (!SvROK(data_ref))
1727
6
          croak("oneway_test: first argument must be a hash or array reference");
1728
1729
4
        SV *restrict rv = SvRV(data_ref);
1730
14
        if (SvTYPE(rv) == SVt_PVHV) {
1731
10
            in_hv = (HV *)rv;
1732
10
        } else if (SvTYPE(rv) == SVt_PVAV) {
1733
10
            in_av = (AV *)rv;
1734
10
        } else {
1735
10
            croak("oneway_test: first argument must be a hash or array reference");
1736        }
1737        if (in_av) {
1738
6
            /* MODE 3 – Array of Arrays (AoA) */
1739
4
            if (formula_str != NULL)
1740
4
                 croak("oneway_test: formula mode is not supported with an array of arrays");
1741
1742
14
            k = (size_t)av_len(in_av) + 1;
1743
10
            if (k < 2)
1744
10
                croak("oneway_test: need at least 2 groups, got %zu", k);
1745
10
            sizes  = (size_t *)safemalloc(k * sizeof(size_t));
1746
10
            gnames = (char  **)safemalloc(k * sizeof(char *));
1747
10
            /* first pass: sizes, total_n, and generate index names */
1748            for (size_t g = 0; g < k; g++) {
1749
4
                SV **restrict val = av_fetch(in_av, (I32)g, 0);
1750
4
                if (!val || !*val || !SvROK(*val) || SvTYPE(SvRV(*val)) != SVt_PVAV)
1751
4
                    croak("oneway_test: index %zu is not an array reference", g);
1752
4
                IV len = av_len((AV *)SvRV(*val)) + 1;
1753                if (len < 2)
1754
6
                     croak("oneway_test: index %zu has fewer than 2 observations", g);
1755                sizes[g] = (size_t)len;
1756                total_n += (IV)len;
1757
4
                /* synthesize group names: "Index 0", "Index 1", ... to match 0-based index */
1758                char buf[64];
1759
2
                snprintf(buf, sizeof(buf), "Index %zu", g);
1760
2
                size_t klen = strlen(buf);
1761                gnames[g] = (char *)safemalloc(klen + 1);
1762
4
                memcpy(gnames[g], buf, klen + 1);
1763
3
            }
1764
3
            /* second pass: fill flat array */
1765            flat = (double *)safemalloc((size_t)total_n * sizeof(double));
1766
1
            size_t offset = 0;
1767
4
            for (size_t g = 0; g < k; g++) {
1768
3
                SV **restrict val = av_fetch(in_av, (I32)g, 0);
1769
3
                AV *restrict av = (AV *)SvRV(*val);
1770
3
                IV len = av_len(av) + 1;
1771
3
                for (IV i = 0; i < len; i++) {
1772                    SV **restrict svp = av_fetch(av, i, 0);
1773
1
                    flat[offset++] = (svp && *svp) ? SvNV(*svp) : 0.0;
1774                }
1775
3
            }
1776
3
        } else if (formula_str != NULL) {
1777
3
          /* MODE 2 – formula  "response ~ factor" */
1778
3
          if (!parse_formula(formula_str, &lhs, &rhs))
1779
3
                   croak("oneway_test: cannot parse formula '%s' — "
1780
3
                         "expected 'response ~ factor'", formula_str);
1781
3
          factor_name = rhs;   /* use the actual factor variable name */
1782
2
          SV **restrict resp_svp = hv_fetch(in_hv, lhs, (I32)strlen(lhs), 0);
1783
1
          if (!resp_svp || !*resp_svp || !SvROK(*resp_svp)
1784                   || SvTYPE(SvRV(*resp_svp)) != SVt_PVAV)
1785
1
                   croak("oneway_test: formula LHS '%s' not found as an array ref "
1786                         "in the hash", lhs);
1787          SV **restrict fact_svp = hv_fetch(in_hv, rhs, (I32)strlen(rhs), 0);
1788
1
          if (!fact_svp || !*fact_svp || !SvROK(*fact_svp)
1789                   || SvTYPE(SvRV(*fact_svp)) != SVt_PVAV)
1790
3
                   croak("oneway_test: formula RHS '%s' not found as an array ref "
1791                         "in the hash", rhs);
1792          AV *restrict resp_av  = (AV *)SvRV(*resp_svp);
1793          AV *restrict label_av = (AV *)SvRV(*fact_svp);
1794          IV  n = av_len(resp_av) + 1;
1795          flat  = (double *)safemalloc((size_t)n * sizeof(double));
1796          sizes = (size_t *)safemalloc((size_t)n * sizeof(size_t));
1797          if (!build_groups_from_formula(aTHX_ resp_av, label_av,
1798                                              flat, sizes, &k, &gnames,
1799                                              errbuf, sizeof errbuf)) {
1800
13
                   Safefree(flat);
1801
13
                   Safefree(sizes); Safefree(lhs); Safefree(rhs);
1802
13
                   croak("oneway_test: %s", errbuf);
1803          }
1804          for (size_t g = 0; g < k; g++) total_n += (IV)sizes[g];
1805
13
        } else {
1806
13
                /* MODE 1 – hash of groups  { label => \@observations, … } */
1807
13
                k = (size_t)hv_iterinit(in_hv);
1808
13
                if (k < 2)
1809
13
                        croak("oneway_test: need at least 2 groups, got %zu", k);
1810                sizes  = (size_t *)safemalloc(k * sizeof(size_t));
1811                gnames = (char  **)safemalloc(k * sizeof(char *));
1812
13
                /* first pass: sizes, total_n, and group name strings */
1813
13
                {
1814
13
                        size_t g = 0;
1815                        while ((he = hv_iternext(in_hv)) != NULL) {
1816                                SV *restrict val = HeVAL(he);
1817
13
                                if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV)
1818
13
                                    croak("oneway_test: value for group '%s' is not an array ref",
1819
13
                                          HePV(he, PL_na));
1820                                IV len = av_len((AV *)SvRV(val)) + 1;
1821                                if (len < 2)
1822
37
                                     croak("oneway_test: group '%s' has fewer than 2 observations",
1823
24
                                           HePV(he, PL_na));
1824
24
                                sizes[g] = (size_t)len;
1825
24
                                total_n += (IV)len;
1826
24
                                /* save a copy of the key string */
1827
24
                                STRLEN klen;
1828
19
                                const char *kstr = HePV(he, klen);
1829
19
                                gnames[g] = (char *)safemalloc(klen + 1);
1830
11
                                memcpy(gnames[g], kstr, klen + 1);
1831
0
                                g++;
1832                        }
1833                }
1834
13
                /* second pass: fill flat in the same iteration order */
1835
0
                flat = (double *)safemalloc((size_t)total_n * sizeof(double));
1836                {
1837
13
                        size_t offset = 0;
1838
13
                        hv_iterinit(in_hv);
1839
0
                        while ((he = hv_iternext(in_hv)) != NULL) {
1840                                 AV *restrict av  = (AV *)SvRV(HeVAL(he));
1841                                 IV  len = av_len(av) + 1;
1842
13
                                 for (IV i = 0; i < len; i++) {
1843
13
                                     SV **restrict svp = av_fetch(av, i, 0);
1844                                     flat[offset++] = (svp && *svp) ? SvNV(*svp) : 0.0;
1845
13
                                 }
1846
5
                        }
1847
1
                }
1848        }
1849        /* per-group means from flat (before c_oneway_test frees nothing) */
1850        gmeans = (double *)safemalloc(k * sizeof(double));
1851
12
        {
1852
12
                size_t offset = 0;
1853                for (size_t g = 0; g < k; g++) {
1854                        double sum = 0.0;
1855
12
                        for (size_t i = 0; i < sizes[g]; i++) sum += flat[offset + i];
1856
9
                        gmeans[g] = sum / (double)sizes[g];
1857
9
                        offset   += sizes[g];
1858                }
1859
9
        }
1860
9
        /* run the arithmetic  */
1861
9
        res = c_oneway_test(flat, sizes, k, var_equal);
1862
0
        Safefree(flat);
1863        if (lhs) Safefree(lhs);
1864
9
        /* rhs kept alive as factor_name until after output */
1865
9
        /* ── build return hash ref
1866
0
        * {                                                                 *
1867        * <factor>  => { Df, "Sum Sq", "Mean Sq", "F value", "Pr(>F)" }  *
1868
9
        * Residuals => { Df, "Sum Sq", "Mean Sq" }                        *
1869
9
        * group_stats => { mean => { g => v, … }, size => { g => n, … } } *
1870
9
        * }                                                                 *
1871
32
        */
1872
23
        ret_hv = (HV *)sv_2mortal((SV *)newHV());
1873
23
        /* Group (factor) sub-hash */
1874
0
        {
1875                HV *restrict g_hv = newHV();
1876                hv_stores(g_hv, "Df",      newSVnv(res.num_df));
1877
9
                hv_stores(g_hv, "Sum Sq",  newSVnv(res.ss_between));
1878
4
                hv_stores(g_hv, "Mean Sq", newSVnv(res.ms_between));
1879
4
                hv_stores(g_hv, "F value", newSVnv(res.statistic));
1880
12
                hv_stores(g_hv, "Pr(>F)",  newSVnv(res.p_value));
1881
8
                hv_store(ret_hv, factor_name, (I32)strlen(factor_name),
1882                                  newRV_noinc((SV *)g_hv), 0);
1883        }
1884        /* Residuals sub-hash */
1885
3
        {
1886
3
                HV *restrict r_hv = newHV();
1887
3
                hv_stores(r_hv, "Df",      newSVnv(res.denom_df));
1888
3
                hv_stores(r_hv, "Sum Sq",  newSVnv(res.ss_within));
1889
0
                hv_stores(r_hv, "Mean Sq", newSVnv(res.ms_within));
1890                hv_stores(ret_hv, "Residuals", newRV_noinc((SV *)r_hv));
1891        }
1892
10
        /* group_stats sub-hash */
1893
7
        {
1894
7
                HV *restrict gs_hv   = newHV();
1895
0
                HV *restrict mean_hv = newHV();
1896                HV *restrict size_hv = newHV();
1897                for (size_t g = 0; g < k; g++) {
1898
3
                        const char *restrict gn  = gnames[g];
1899                        I32         gnl = (I32)strlen(gn);
1900                        hv_store(mean_hv, gn, gnl, newSVnv(gmeans[g]),       0);
1901
12
                        hv_store(size_hv, gn, gnl, newSViv((IV)sizes[g]),    0);
1902
12
                }
1903                hv_stores(gs_hv, "mean", newRV_noinc((SV *)mean_hv));
1904
12
                hv_stores(gs_hv, "size", newRV_noinc((SV *)size_hv));
1905
12
                hv_stores(ret_hv, "group_stats", newRV_noinc((SV *)gs_hv));
1906
12
        }
1907        /* clean up */
1908        Safefree(gmeans);       Safefree(sizes);
1909
12
        for (size_t g = 0; g < k; g++) Safefree(gnames[g]);
1910
5
        Safefree(gnames);
1911
1
        if (rhs) Safefree(rhs);
1912
4
        /* freed here, after factor_name is no longer needed */
1913
3
        RETVAL = newRV((SV *)ret_hv);
1914
3
  OUTPUT:
1915    RETVAL
1916
1917
3
SV* ks_test(...)
1918
3
CODE:
1919{
1920
9
        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
1921
6
        short int exact = -1;
1922
6
        const char *restrict alternative = "two.sided";
1923        int arg_idx = 0;
1924
1925
12
        // Shift arrays if provided positionally
1926        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1927                x_sv = ST(arg_idx);
1928
3
                arg_idx++;
1929
3
        }
1930
12
        // Check if second argument is an array (2-sample) or a string representing a CDF (1-sample)
1931
9
        if (arg_idx < items) {
1932
9
                if (SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
1933                        y_sv = ST(arg_idx);
1934
3
                        arg_idx++;
1935
12
                } else if (SvPOK(ST(arg_idx))) {
1936
3
                        y_sv = ST(arg_idx); // Save string (e.g., "pnorm") for 1-sample test logic
1937
3
                        arg_idx++;
1938                }
1939
4
        }
1940
4
        // Parse named arguments
1941        for (; arg_idx < items; arg_idx += 2) {
1942
4
          const char *restrict key = SvPV_nolen(ST(arg_idx));
1943
4
          SV *restrict val = ST(arg_idx + 1);
1944
16
          if      (strEQ(key, "x"))           x_sv = val;
1945
12
          else if (strEQ(key, "y"))           y_sv = val;
1946
12
          else if (strEQ(key, "exact"))       {
1947                   if (!SvOK(val)) exact = -1;
1948
4
                   else exact = SvTRUE(val) ? 1 : 0;
1949
4
          }
1950          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
1951
4
          else croak("ks_test: unknown argument '%s'", key);
1952
4
        }
1953
1954
8
        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
1955          croak("ks_test: 'x' is a required argument and must be an ARRAY reference");
1956
4
        }
1957
1958
4
        bool is_two_sided = strEQ(alternative, "two.sided") ? 1 : 0;
1959
4
        bool is_greater   = strEQ(alternative, "greater") ? 1 : 0;
1960        bool is_less      = strEQ(alternative, "less") ? 1 : 0;
1961
1962
8
        if (!is_two_sided && !is_greater && !is_less) {
1963
8
          croak("ks_test: alternative must be 'two.sided', 'less', or 'greater'");
1964        }
1965
1966
8
        AV *restrict x_av = (AV*)SvRV(x_sv);
1967        size_t nx = av_len(x_av) + 1;
1968
33
        if (nx == 0) croak("Not enough 'x' observations");
1969
1970
26
        // Extract 'x' array to C-array
1971
26
        double *restrict x_data = (double *)safemalloc(nx * sizeof(double));
1972
26
        size_t valid_nx = 0;
1973
15
        for (size_t i = 0; i < nx; i++) {
1974
1
          SV**restrict el = av_fetch(x_av, i, 0);
1975
1
          if (el && SvOK(*el) && looks_like_number(*el)) {
1976
1
                   x_data[valid_nx++] = SvNV(*el);
1977
1
          }
1978
1
        }
1979
1980        double statistic = 0.0, p_value = 0.0;
1981
14
        const char *restrict method_desc = "";
1982
1983
11
        // --- TWO SAMPLE ---
1984        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
1985          AV *restrict y_av = (AV*)SvRV(y_sv);
1986
7
          size_t ny = av_len(y_av) + 1;
1987
1988
3
          double *restrict y_data = (double *)safemalloc(ny * sizeof(double));
1989          size_t valid_ny = 0;
1990
8
          for (size_t i = 0; i < ny; i++) {
1991
5
                   SV**restrict el = av_fetch(y_av, i, 0);
1992
5
                   if (el && SvOK(*el) && looks_like_number(*el)) {
1993
5
                       y_data[valid_ny++] = SvNV(*el);
1994                   }
1995
20
          }
1996
1997
15
          if (valid_nx < 1 || valid_ny < 1) {
1998
15
                   Safefree(x_data); Safefree(y_data);
1999                   croak("Not enough non-missing observations for KS test");
2000          }
2001
2002
2
          double d, d_plus, d_minus;
2003
8
          calc_2sample_stats(x_data, valid_nx, y_data, valid_ny, &d, &d_plus, &d_minus);
2004
2005
6
          // Map alternative to the correct statistic
2006          if (is_greater) statistic = d_plus;
2007          else if (is_less) statistic = d_minus;
2008
3
          else statistic = d;
2009
2010
12
          // Determine if exact or asymptotic
2011
9
          bool use_exact = FALSE;
2012
9
          if (exact == 1) use_exact = TRUE;
2013          else if (exact == 0) use_exact = FALSE;
2014
3
          else use_exact = (valid_nx * valid_ny < 10000);
2015
2016
3
          // Check for ties in combined set
2017          size_t total_n = valid_nx + valid_ny;
2018
5
          double *restrict comb = (double *)safemalloc(total_n * sizeof(double));
2019
5
          for(size_t i=0; i<valid_nx; i++) comb[i] = x_data[i];
2020
0
          for(size_t i=0; i<valid_ny; i++) comb[valid_nx+i] = y_data[i];
2021
0
          qsort(comb, total_n, sizeof(double), compare_doubles);
2022
2023
0
          bool has_ties = FALSE;
2024
0
          for(size_t i = 1; i < total_n; i++) {
2025
0
                   if(comb[i] == comb[i-1]) { has_ties = TRUE; break; }
2026
0
          }
2027
0
          Safefree(comb);
2028
0
          if (use_exact && has_ties) {
2029                   warn("cannot compute exact p-value with ties; falling back to asymptotic");
2030                   use_exact = FALSE;
2031
0
          }
2032
0
          if (use_exact) {
2033                   method_desc = "Two-sample Kolmogorov-Smirnov exact test";
2034
5
                   double q = (0.5 + floor(statistic * valid_nx * valid_ny - 1e-7)) / ((double)valid_nx * valid_ny);
2035
5
                   p_value = psmirnov_exact_uniq_upper(q, valid_nx, valid_ny, is_two_sided);
2036
5
          } else {
2037
5
                   method_desc = "Two-sample Kolmogorov-Smirnov test (asymptotic)";
2038
20
                   double z = statistic * sqrt((double)(valid_nx * valid_ny) / (valid_nx + valid_ny));
2039
15
                   if (is_two_sided) {
2040
15
                       p_value = K2l(z, 0, 1e-9);
2041                   } else {
2042
5
                       p_value = exp(-2.0 * z * z); // One-sided limit distribution
2043
5
                   }
2044
5
          }
2045
32
          Safefree(y_data);
2046
27
        } else if (y_sv && SvPOK(y_sv)) {// --- ONE SAMPLE (e.g. against pnorm) ---
2047
27
                const char *restrict dist = SvPV_nolen(y_sv);
2048
23
                if (strEQ(dist, "pnorm")) {
2049
0
                        qsort(x_data, valid_nx, sizeof(double), compare_doubles);
2050
0
                        double max_d = 0.0, max_d_plus = 0.0, max_d_minus = 0.0;
2051
0
                        for(size_t i = 0; i < valid_nx; i++) {
2052
0
                                double cdf_obs_low  = (double)i / valid_nx;
2053
0
                                double cdf_obs_high = (double)(i + 1) / valid_nx;
2054
0
                                double cdf_theor    = approx_pnorm(x_data[i]);
2055
0
                                double diff1 = cdf_obs_low - cdf_theor;
2056
0
                                double diff2 = cdf_obs_high - cdf_theor;
2057
0
                                if (diff1 > max_d_plus) max_d_plus = diff1;
2058
0
                                if (diff2 > max_d_plus) max_d_plus = diff2;
2059                                if (-diff1 > max_d_minus) max_d_minus = -diff1;
2060
0
                                if (-diff2 > max_d_minus) max_d_minus = -diff2;
2061                                if (fabs(diff1) > max_d) max_d = fabs(diff1);
2062
0
                                if (fabs(diff2) > max_d) max_d = fabs(diff2);
2063                        }
2064                        if (is_greater) statistic = max_d_plus;
2065
0
                        else if (is_less) statistic = max_d_minus;
2066                        else statistic = max_d;
2067                        bool use_exact = (exact == -1) ? (valid_nx < 100) : (exact == 1);
2068                        if (use_exact) {
2069
23
                                method_desc = "One-sample Kolmogorov-Smirnov exact test";
2070
23
                                if (is_two_sided) {
2071                                        p_value = 1.0 - K2x(valid_nx, statistic);
2072                                } else {
2073
108
                                        warn("exact 1-sample 1-sided KS test not implemented; using asymptotic");
2074
81
                                        double z = statistic * sqrt((double)valid_nx);
2075
81
                                        p_value = exp(-2.0 * z * z);
2076
81
                                }
2077
162
                        } else {
2078
81
                                 method_desc = "One-sample Kolmogorov-Smirnov test (asymptotic)";
2079
81
                                 double z = statistic * sqrt((double)valid_nx);
2080
81
                                 if (is_two_sided) p_value = K2l(z, 0, 1e-6);
2081
46
                                 else p_value = exp(-2.0 * z * z);
2082
0
                        }
2083
0
                } else {
2084
0
                         Safefree(x_data);
2085
0
                         croak("ks_test: Unsupported 1-sample distribution '%s'. Use arrays for 2-sample.", dist);
2086                }
2087
46
        } else {
2088          Safefree(x_data);
2089
35
          croak("ks_test: Invalid arguments for 'y'.");
2090        }
2091        Safefree(x_data);
2092
0
        if (p_value > 1.0) p_value = 1.0;
2093        if (p_value < 0.0) p_value = 0.0;
2094        HV *restrict res = newHV();
2095
27
        hv_stores(res, "statistic", newSVnv(statistic));
2096
27
        hv_stores(res, "p_value", newSVnv(p_value));
2097        hv_stores(res, "method", newSVpv(method_desc, 0));
2098
5
        hv_stores(res, "alternative", newSVpv(alternative, 0));
2099
3
        RETVAL = newRV_noinc((SV*)res);
2100
3
}
2101
3
OUTPUT:
2102
4
        RETVAL
2103
2104
3
SV* wilcox_test(...)
2105
2
CODE:
2106
2
{
2107        SV *restrict x_sv = NULL, *restrict y_sv = NULL;
2108        bool paired = FALSE, correct = TRUE;
2109
2
        double mu = 0.0;
2110
7
        short int exact = -1;
2111
5
        const char *restrict alternative = "two.sided";
2112
5
        int arg_idx = 0;
2113
5
        // 1. Shift first positional argument as 'x' if it's an array reference
2114
5
        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
2115                x_sv = ST(arg_idx);
2116
14
                arg_idx++;
2117
9
        }
2118        // 2. Shift second positional argument as 'y' if it's an array reference
2119        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
2120                y_sv = ST(arg_idx);
2121
2
                arg_idx++;
2122
2
        }
2123
7
        // Ensure the remaining arguments form complete key-value pairs
2124
5
        if ((items - arg_idx) % 2 != 0) {
2125
5
                croak("Usage: wilcox_test(\\@x, [\\@y], key => value, ...)");
2126        }
2127
2
        // --- Parse named arguments from the remaining flat stack ---
2128
7
        for (; arg_idx < items; arg_idx += 2) {
2129
2
                const char *restrict key = SvPV_nolen(ST(arg_idx));
2130
2
                SV *restrict val = ST(arg_idx + 1);
2131                if      (strEQ(key, "x"))          x_sv = val;
2132
3
                else if (strEQ(key, "y"))          y_sv = val;
2133
0
                else if (strEQ(key, "paired"))     paired = SvTRUE(val);
2134
0
                else if (strEQ(key, "correct"))    correct = SvTRUE(val);
2135
0
                else if (strEQ(key, "mu"))          mu = SvNV(val);
2136
0
                else if (strEQ(key, "exact"))       {
2137
0
                        if (!SvOK(val)) exact = -1;
2138
0
                        else exact = SvTRUE(val) ? 1 : 0;
2139
0
                }
2140
0
                else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
2141                else croak("wilcox_test: unknown argument '%s'", key);
2142        }
2143
0
        // --- Validate required / types ---
2144
0
        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
2145                croak("wilcox_test: 'x' is a required argument and must be an ARRAY reference");
2146
3
        AV *restrict x_av = (AV*)SvRV(x_sv);
2147
3
        size_t nx = av_len(x_av) + 1;
2148
3
        if (nx == 0) croak("Not enough 'x' observations");
2149
2150
10
        AV *restrict y_av = NULL;
2151
7
        size_t ny = 0;
2152
7
        if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV) {
2153                y_av = (AV*)SvRV(y_sv);
2154
3
                ny = av_len(y_av) + 1;
2155
3
        }
2156
3
        double p_value = 0.0, statistic = 0.0;
2157
10
        const char *restrict method_desc = "";
2158
7
        bool use_exact = FALSE;
2159
7
        // --- TWO SAMPLE (Mann-Whitney) ---
2160
7
        if (ny > 0 && !paired) {
2161
7
                RankInfo *restrict ri = (RankInfo *)safemalloc((nx + ny) * sizeof(RankInfo));
2162
3
                size_t valid_nx = 0, valid_ny = 0;
2163
0
                for (size_t i = 0; i < nx; i++) {
2164
0
                        SV**restrict el = av_fetch(x_av, i, 0);
2165
0
                        if (el && SvOK(*el) && looks_like_number(*el)) {
2166
0
                                ri[valid_nx].val = SvNV(*el) - mu; // R subtracts mu from x
2167
0
                                ri[valid_nx].idx = 1;
2168
0
                                valid_nx++;
2169
0
                        }
2170                }
2171
0
                for (size_t i = 0; i < ny; i++) {
2172                        SV**restrict el = av_fetch(y_av, i, 0);
2173
0
                        if (el && SvOK(*el) && looks_like_number(*el)) {
2174                                ri[valid_nx + valid_ny].val = SvNV(*el);
2175                                ri[valid_nx + valid_ny].idx = 2;
2176                                valid_ny++;
2177
3
                        }
2178
3
                }
2179                if (valid_nx == 0) { Safefree(ri); croak("not enough (non-missing) 'x' observations"); }
2180                if (valid_ny == 0) { Safefree(ri); croak("not enough 'y' observations"); }
2181                size_t total_n = valid_nx + valid_ny;
2182
24
                bool has_ties = 0;
2183
17
                double tie_adj = rank_and_count_ties(ri, total_n, &has_ties);
2184
17
                double w_rank_sum = 0.0;
2185
17
                for (size_t i = 0; i < total_n; i++) if (ri[i].idx == 1) w_rank_sum += ri[i].rank;
2186
17
                statistic = w_rank_sum - (double)valid_nx * (valid_nx + 1.0) / 2.0;
2187
2188
0
                if (exact == 1) use_exact = TRUE;
2189
0
                else if (exact == 0) use_exact = FALSE;
2190
0
                else use_exact = (valid_nx < 50 && valid_ny < 50 && !has_ties);
2191
2192                if (use_exact && has_ties) {
2193
13
                        warn("cannot compute exact p-value with ties; falling back to approximation");
2194                        use_exact = FALSE;
2195
4
                }
2196                if (use_exact) {
2197                        method_desc = "Wilcoxon rank sum exact test";
2198
7
                        double p_less = exact_pwilcox(statistic, valid_nx, valid_ny);
2199
7
                        double p_greater = 1.0 - exact_pwilcox(statistic - 1.0, valid_nx, valid_ny);
2200
2201
3
                        if (strcmp(alternative, "less") == 0) p_value = p_less;
2202                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
2203
11
                        else {
2204
11
                                double p = (p_less < p_greater) ? p_less : p_greater;
2205
11
                                p_value = 2.0 * p;
2206
11
                        }
2207                } else {
2208                        method_desc = correct ? "Wilcoxon rank sum test with continuity correction" : "Wilcoxon rank sum test";
2209                        double exp = (double)valid_nx * valid_ny / 2.0;
2210                        double var = ((double)valid_nx * valid_ny / 12.0) * ((total_n + 1.0) - tie_adj / (total_n * (total_n - 1.0)));
2211                        double z = statistic - exp;
2212
2213
13
                        double CORRECTION = 0.0;
2214
13
                        if (correct) {
2215
13
                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
2216
13
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
2217                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
2218                        }
2219
13
                        z = (z - CORRECTION) / sqrt(var);
2220
2221
13
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
2222
13
                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
2223                        else p_value = 2.0 * approx_pnorm(-fabs(z));
2224
0
                }
2225                Safefree(ri);
2226
13
        } else { // --- ONE SAMPLE / PAIRED ---
2227
13
                if (paired && (!y_av || nx != ny)) croak("'x' and 'y' must have the same length for paired test");
2228                double *restrict diffs = (double *)safemalloc(nx * sizeof(double));
2229
13
                size_t n_nz = 0;
2230
13
                bool has_zeroes = FALSE;
2231
0
                for (size_t i = 0; i < nx; i++) {
2232                        SV**restrict x_el = av_fetch(x_av, i, 0);
2233
13
                        if (!x_el || !SvOK(*x_el) || !looks_like_number(*x_el)) continue;
2234                        double dx = SvNV(*x_el);
2235
2236
3962
                        if (paired) {
2237
3962
                                SV**restrict y_el = av_fetch(y_av, i, 0);
2238                                if (!y_el || !SvOK(*y_el) || !looks_like_number(*y_el)) continue;
2239
3962
                                double dy = SvNV(*y_el);
2240
3960
                                double d = dx - dy - mu;
2241
3960
                                if (d == 0.0) has_zeroes = TRUE; // Drop exact zeroes
2242
3696
                                else diffs[n_nz++] = d;
2243                        } else {
2244                                double d = dx - mu;
2245
3962
                                if (d == 0.0) has_zeroes = TRUE;
2246                                else diffs[n_nz++] = d;
2247
3961
                        }
2248
3962
                }
2249
3962
                if (n_nz == 0) {
2250                        Safefree(diffs);
2251
3961
                        croak("not enough (non-missing) observations");
2252                }
2253                RankInfo *ri = (RankInfo *)safemalloc(n_nz * sizeof(RankInfo));
2254
3961
                for (size_t i = 0; i < n_nz; i++) {
2255
0
                        ri[i].val = fabs(diffs[i]);
2256                        ri[i].idx = (diffs[i] > 0);
2257                }
2258                bool has_ties = 0;
2259
293588
                double tie_adj = rank_and_count_ties(ri, n_nz, &has_ties);
2260
289626
                statistic = 0.0;
2261
289626
                for (size_t i = 0; i < n_nz; i++) {
2262
289625
                        if (ri[i].idx) statistic += ri[i].rank;
2263
22325
                }
2264
3
                if (exact == 1) use_exact = TRUE;
2265
3
                else if (exact == 0) use_exact = FALSE;
2266
22322
                else use_exact = (n_nz < 50 && !has_ties);
2267
11160
                if (use_exact && has_ties) {
2268
11160
                        warn("cannot compute exact p-value with ties; falling back to approximation");
2269
11162
                        use_exact = FALSE;
2270
11161
                }
2271                if (use_exact && has_zeroes) {
2272
267300
                        warn("cannot compute exact p-value with zeroes; falling back to approximation");
2273
51615
                        use_exact = FALSE;
2274
51615
                }
2275
51615
                if (use_exact) {
2276
51615
                        method_desc = paired ? "Wilcoxon exact signed rank test" : "Wilcoxon exact signed rank test";
2277                        double p_less = exact_psignrank(statistic, n_nz);
2278
215685
                        double p_greater = 1.0 - exact_psignrank(statistic - 1.0, n_nz);
2279
2280                        if (strcmp(alternative, "less") == 0) p_value = p_less;
2281
3962
                        else if (strcmp(alternative, "greater") == 0) p_value = p_greater;
2282                        else {
2283
2
                                double p = (p_less < p_greater) ? p_less : p_greater;
2284                                p_value = 2.0 * p;
2285
3960
                        }
2286                } else {
2287
3960
                        method_desc = correct ? "Wilcoxon signed rank test with continuity correction" : "Wilcoxon signed rank test";
2288
3960
                        double exp = (double)n_nz * (n_nz + 1.0) / 4.0;
2289                        double var = (n_nz * (n_nz + 1.0) * (2.0 * n_nz + 1.0) / 24.0) - (tie_adj / 48.0);
2290
3960
                        double z = statistic - exp;
2291
3960
                        double CORRECTION = 0.0;
2292
3960
                        if (correct) {
2293
3960
                                if (strcmp(alternative, "two.sided") == 0) CORRECTION = (z > 0 ? 0.5 : -0.5);
2294
3960
                                else if (strcmp(alternative, "greater") == 0) CORRECTION = 0.5;
2295
3960
                                else if (strcmp(alternative, "less") == 0) CORRECTION = -0.5;
2296
3960
                        }
2297
3960
                        z = (z - CORRECTION) / sqrt(var);
2298
2299
3960
                        if (strcmp(alternative, "less") == 0) p_value = approx_pnorm(z);
2300
3960
                        else if (strcmp(alternative, "greater") == 0) p_value = 1.0 - approx_pnorm(z);
2301                        else p_value = 2.0 * approx_pnorm(-fabs(z));
2302
0
                }
2303                Safefree(ri); Safefree(diffs);
2304
3960
        }
2305        if (p_value > 1.0) p_value = 1.0;
2306        HV *restrict res = newHV();
2307
13
        hv_stores(res, "statistic", newSVnv(statistic));
2308
13
        hv_stores(res, "p_value", newSVnv(p_value));
2309        hv_stores(res, "method", newSVpv(method_desc, 0));
2310
13
        hv_stores(res, "alternative", newSVpv(alternative, 0));
2311
1
        RETVAL = newRV_noinc((SV*)res);
2312
1
}
2313
1
OUTPUT:
2314
1
        RETVAL
2315
2316
1
SV* chisq_test(data_ref)
2317
1
    SV* data_ref;
2318
1
CODE:
2319
1
{
2320
1
    // 1. Input Validation (mimics: die 'Input must be an array reference')
2321
1
    if (!SvROK(data_ref) || SvTYPE(SvRV(data_ref)) != SVt_PVAV) {
2322
1
        croak("Input must be an array reference");
2323    }
2324
2325    AV*restrict obs_av = (AV*)SvRV(data_ref);
2326
1
    int r = av_top_index(obs_av) + 1, c = 0;
2327    bool is_2d = 0;
2328
13
    SV**restrict first_elem = av_fetch(obs_av, 0, 0);
2329
13
    if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
2330
13
        is_2d = 1;
2331
13
        AV*restrict first_row = (AV*)SvRV(*first_elem);
2332        c = av_top_index(first_row) + 1;
2333
0
    } else {
2334        c = r;
2335        r = 1;
2336    }
2337    double stat = 0.0, grand_total = 0.0;
2338    int df = 0;
2339    bool yates = (is_2d && r == 2 && c == 2) ? 1 : 0;
2340    AV*restrict expected_av = newAV();
2341    if (is_2d) {
2342
4
        double *restrict row_sum = (double*)safemalloc(r * sizeof(double));
2343
0
        double *restrict col_sum = (double*)safemalloc(c * sizeof(double));
2344        for(unsigned int i=0; i<r; i++) row_sum[i] = 0.0;
2345
4
        for(unsigned int j=0; j<c; j++) col_sum[j] = 0.0;
2346
0
        for (unsigned int i = 0; i < r; i++) {
2347            SV**restrict row_sv = av_fetch(obs_av, i, 0);
2348            AV*restrict row = (AV*)SvRV(*row_sv);
2349            for (unsigned int j = 0; j < c; j++) {
2350
4
                 SV**restrict val_sv = av_fetch(row, j, 0);
2351
2
                 double val = SvNV(*val_sv);
2352
1
                 row_sum[i] += val;
2353
0
                 col_sum[j] += val;
2354                 grand_total += val;
2355            }
2356
4
        }
2357
4
        for (unsigned int i = 0; i < r; i++) {
2358
4
            AV*restrict exp_row = newAV();
2359
4
            SV**restrict row_sv = av_fetch(obs_av, i, 0);
2360            AV*restrict row = (AV*)SvRV(*row_sv);
2361
4
            for (unsigned int j = 0; j < c; j++) {
2362
0
                double E = (row_sum[i] * col_sum[j]) / grand_total;
2363                SV**restrict val_sv = av_fetch(row, j, 0);
2364                double O = SvNV(*val_sv);
2365                av_push(exp_row, newSVnv(E));
2366                if (yates) {
2367                  // Exact R logic: min(0.5, abs(O - E))
2368
4
                  double abs_diff = fabs(O - E);
2369
4
                  double y_corr = (abs_diff > 0.5) ? 0.5 : abs_diff;
2370
4
                  double diff = abs_diff - y_corr;
2371                  stat += (diff * diff) / E;
2372
24
                } else {
2373
20
                  stat += ((O - E) * (O - E)) / E;
2374
20
                }
2375            }
2376            av_push(expected_av, newRV_noinc((SV*)exp_row));
2377
20
        }
2378
20
        safefree(row_sum); safefree(col_sum);
2379        df = (r - 1) * (c - 1);
2380    } else {
2381
20
      for (unsigned int j = 0; j < c; j++) {
2382
20
           SV**restrict val_sv = av_fetch(obs_av, j, 0);
2383
20
           grand_total += SvNV(*val_sv);
2384
20
      }
2385      double E = grand_total / (double)c;
2386      for (unsigned int j = 0; j < c; j++) {
2387           SV**restrict val_sv = av_fetch(obs_av, j, 0);
2388           double O = SvNV(*val_sv);
2389
4
           av_push(expected_av, newSVnv(E));
2390
0
           stat += ((O - E) * (O - E)) / E;
2391
0
      }
2392      df = c - 1;
2393
4
    }
2394    double p_val = get_p_value(stat, df);
2395
2396    // 2. Build the top-level results Hash (mimicking R's htest structure)
2397
6
    HV*restrict results = newHV();
2398
2399
25
    // 'statistic' => { 'X-squared' => stat }
2400
25
    HV*restrict statistic_hv = newHV();
2401
25
    hv_store(statistic_hv, "X-squared", 9, newSVnv(stat), 0);
2402    hv_store(results, "statistic", 9, newRV_noinc((SV*)statistic_hv), 0);
2403
2404    // 'parameter' => { 'df' => df }
2405
3
    HV*restrict parameter_hv = newHV();
2406
3
    hv_store(parameter_hv, "df", 2, newSViv(df), 0);
2407    hv_store(results, "parameter", 9, newRV_noinc((SV*)parameter_hv), 0);
2408
2409
1
    // 'p.value' => p_val
2410    hv_store(results, "p.value", 7, newSVnv(p_val), 0);
2411
2412
1
    // 'expected' => expected_av
2413
1
    hv_store(results, "expected", 8, newRV_noinc((SV*)expected_av), 0);
2414
2415
6
    // 'observed' => data_ref (Increment ref count since hv_store consumes ownership)
2416
5
    hv_store(results, "observed", 8, SvREFCNT_inc(data_ref), 0);
2417
2418
5
    // 'data.name' => 'Perl ArrayRef'
2419
5
    hv_store(results, "data.name", 9, newSVpv("Perl ArrayRef", 0), 0);
2420
2421    // 'method' => String
2422    if (is_2d) {
2423
1
        if (yates) {
2424
1
            hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test with Yates' continuity correction", 0), 0);
2425        } else {
2426            hv_store(results, "method", 6, newSVpv("Pearson's Chi-squared test", 0), 0);
2427
12
        }
2428
10
    } else {
2429
10
        hv_store(results, "method", 6, newSVpv("Chi-squared test for given probabilities", 0), 0);
2430
10
    }
2431
2432
10
    RETVAL = newRV_noinc((SV*)results);
2433}
2434OUTPUT:
2435    RETVAL
2436
2437
3
PROTOTYPES: ENABLE
2438
2439void write_table(...)
2440
4
PPCODE:
2441
4
{
2442
4
    SV *restrict data_sv = NULL;
2443    SV *restrict file_sv = NULL;
2444    unsigned int arg_idx = 0;
2445
2446    // Mimic the Perl shift logic
2447    if (arg_idx < items && SvROK(ST(arg_idx))) {
2448        int type = SvTYPE(SvRV(ST(arg_idx)));
2449        if (type == SVt_PVHV || type == SVt_PVAV) {
2450            data_sv = ST(arg_idx);
2451
7
            arg_idx++;
2452
7
        }
2453
7
    }
2454    if (arg_idx < items) {
2455        file_sv = ST(arg_idx);
2456        arg_idx++;
2457    }
2458
2459
7
    const char *restrict sep = ",";
2460
7
    bool explicit_sep = 0; // Track if delimiter was manually specified
2461
7
    const char *restrict undef_val = "NA";
2462
7
    SV *restrict row_names_sv = sv_2mortal(newSViv(1));
2463
7
    SV *restrict col_names_sv = NULL;
2464
2465
7
    // Read the remaining Hash-style arguments
2466
7
    for (; arg_idx < items; arg_idx += 2) {
2467        if (arg_idx + 1 >= items) croak("write_table: Odd number of arguments passed");
2468
7
        const char *restrict key = SvPV_nolen(ST(arg_idx));
2469
7
        SV *restrict val = ST(arg_idx + 1);
2470
7
        if (strEQ(key, "data")) data_sv = val;
2471
7
        else if (strEQ(key, "col.names")) col_names_sv = val;
2472
7
        else if (strEQ(key, "file")) file_sv = val;
2473        else if (strEQ(key, "row.names")) row_names_sv = val;
2474
7
        // NEW: Check for either "sep" or "delim" and mark as explicitly provided
2475
7
        else if (strEQ(key, "sep") || strEQ(key, "delim")) {
2476
7
            sep = SvPV_nolen(val);
2477
7
            explicit_sep = 1;
2478        }
2479        else if (strEQ(key, "undef.val")) undef_val = SvPV_nolen(val);
2480        else croak("write_table: Unknown arguments passed: %s", key);
2481    }
2482
2483
7
    if (!data_sv || !SvROK(data_sv)) {
2484        croak("write_table: 'data' must be a HASH or ARRAY reference\n");
2485
26
    }
2486
19
    SV *restrict data_ref = SvRV(data_sv);
2487
19
    if (SvTYPE(data_ref) != SVt_PVHV && SvTYPE(data_ref) != SVt_PVAV) {
2488
19
        croak("write_table: 'data' must be a HASH or ARRAY reference\n");
2489
12
    }
2490
2491
0
    if (!file_sv || !SvOK(file_sv)) croak("write_table: file name missing\n");
2492    const char *restrict file = SvPV_nolen(file_sv);
2493
2494
7
    // NEW: Auto-detect separator from file extension if not overridden
2495    if (!explicit_sep) {
2496
7
        size_t file_len = strlen(file);
2497
7
        if (file_len >= 4) {
2498
7
            const char *restrict ext = file + file_len - 4;
2499            if (strEQ(ext, ".tsv") || strEQ(ext, ".TSV")) {
2500                sep = "\t";
2501
7
            } else if (strEQ(ext, ".csv") || strEQ(ext, ".CSV")) {
2502
7
                sep = ",";
2503
7
            }
2504        }
2505
7
    }
2506
2507
7
    if (col_names_sv && SvOK(col_names_sv)) {
2508        if (!SvROK(col_names_sv) || SvTYPE(SvRV(col_names_sv)) != SVt_PVAV) {
2509
7
            croak("write_table: 'col.names' must be an ARRAY reference\n");
2510
7
        }
2511
7
    }
2512
2513    bool is_hoh = 0, is_hoa = 0, is_aoh = 0;
2514
7
    AV *restrict rows_av = NULL;
2515
2516    // Validate Input Structures & Homogeneity
2517
7
    if (SvTYPE(data_ref) == SVt_PVHV) {
2518
20
        HV *restrict hv = (HV*)data_ref;
2519
13
        if (hv_iterinit(hv) == 0) XSRETURN_EMPTY;
2520
2521
0
        HE *restrict entry = hv_iternext(hv);
2522        SV *restrict first_val = hv_iterval(hv, entry);
2523
13
        if (!first_val || !SvROK(first_val)) {
2524
0
            croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
2525
0
        }
2526        int first_type = SvTYPE(SvRV(first_val));
2527
13
        if (first_type != SVt_PVHV && first_type != SVt_PVAV) {
2528
13
            croak("write_table: Data values must be either all HASHes or all ARRAYs\n");
2529
0
        }
2530
0
        is_hoh = (first_type == SVt_PVHV);
2531
0
        is_hoa = (first_type == SVt_PVAV);
2532
0
        hv_iterinit(hv);
2533        while ((entry = hv_iternext(hv))) {
2534
0
            SV *restrict val = hv_iterval(hv, entry);
2535
0
            if (!val || !SvROK(val) || SvTYPE(SvRV(val)) != first_type) {
2536
0
                croak("write_table: Mixed data types detected. Ensure all values are %s references.\n", is_hoh ? "HASH" : "ARRAY");
2537
0
            }
2538
0
        }
2539        if (is_hoh) {
2540
13
            rows_av = newAV();
2541
13
            hv_iterinit(hv);
2542
13
            while ((entry = hv_iternext(hv))) {
2543                av_push(rows_av, newSVsv(hv_iterkeysv(entry)));
2544
13
            }
2545        }
2546    } else {
2547
27
        AV *restrict av = (AV*)data_ref;
2548
20
        if (av_len(av) < 0) XSRETURN_EMPTY;
2549
39
        SV **restrict first_ptr = av_fetch(av, 0, 0);
2550
19
        if (!first_ptr || !*first_ptr || !SvROK(*first_ptr) || SvTYPE(SvRV(*first_ptr)) != SVt_PVHV) {
2551            croak("write_table: For ARRAY data, all elements must be HASH references (Array of Hashes)\n");
2552
20
        }
2553
2554
7
        for (size_t i = 0; i <= av_len(av); i++) {
2555            SV **restrict ptr = av_fetch(av, i, 0);
2556            if (!ptr || !*ptr || !SvROK(*ptr) || SvTYPE(SvRV(*ptr)) != SVt_PVHV) {
2557
7
                croak("write_table: Mixed data types detected in Array of Hashes. All elements must be HASH references.\n");
2558
7
            }
2559
7
        }
2560
7
        is_aoh = 1;
2561
7
    }
2562
2563
7
    PerlIO *restrict fh = PerlIO_open(file, "w");
2564
7
    if (!fh) croak("write_table: Could not open '%s' for writing", file);
2565
2566
2
    AV *restrict headers_av = newAV();
2567
2
    bool inc_rownames = (row_names_sv && SvTRUE(row_names_sv)) ? 1 : 0;
2568
122
    const char *restrict rownames_col = NULL;
2569
2570
120
    // ----- Hash of Hashes -----
2571    if (is_hoh) {
2572
5
        if (col_names_sv && SvOK(col_names_sv)) {
2573
5
            AV *restrict c_av = (AV*)SvRV(col_names_sv);
2574
5
            for(size_t i=0; i<=av_len(c_av); i++) {
2575
5
                SV **restrict c = av_fetch(c_av, i, 0);
2576
165
                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2577            }
2578
160
        } else {
2579
160
            HV *restrict col_map = newHV();
2580
160
            hv_iterinit((HV*)data_ref);
2581            HE *restrict entry;
2582
0
            while((entry = hv_iternext((HV*)data_ref))) {
2583                HV *restrict inner = (HV*)SvRV(hv_iterval((HV*)data_ref, entry));
2584
0
                hv_iterinit(inner);
2585
0
                HE *restrict inner_entry;
2586
0
                while((inner_entry = hv_iternext(inner))) {
2587
0
                    hv_store_ent(col_map, hv_iterkeysv(inner_entry), newSViv(1), 0);
2588
0
                }
2589
0
            }
2590
0
            unsigned num_cols = hv_iterinit(col_map);
2591
0
            const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2592
0
            for(unsigned i=0; i<num_cols; i++) {
2593
0
                HE *restrict ce = hv_iternext(col_map);
2594                col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2595
0
            }
2596
0
            qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2597
0
            for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2598            safefree(col_array);
2599            SvREFCNT_dec(col_map);
2600
0
    }
2601    size_t num_headers = av_len(headers_av) + 1;
2602    const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2603
2604
20
    size_t h_idx = 0;
2605
0
    if (inc_rownames) header_row[h_idx++] = "";
2606
0
    for(unsigned short int i=0; i<num_headers; i++) {
2607
0
        SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2608        header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2609
20
    }
2610
7
    print_string_row(fh, header_row, h_idx, sep);
2611    safefree(header_row);
2612
2613
1
    size_t num_rows = av_len(rows_av) + 1;
2614
1
    const char **restrict row_array = safemalloc(num_rows * sizeof(char*));
2615
61
    for(size_t i=0; i<num_rows; i++) {
2616
60
        row_array[i] = SvPV_nolen(*av_fetch(rows_av, i, 0));
2617
60
    }
2618
60
    qsort(row_array, num_rows, sizeof(char*), cmp_string_wt);
2619
2620
88
    HV *restrict data_hv = (HV*)data_ref;
2621    const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2622
2623
2
    for(size_t i=0; i<num_rows; i++) {
2624
2
        size_t d_idx = 0;
2625        if (inc_rownames) row_data[d_idx++] = row_array[i];
2626
2627        SV **restrict inner_hv_ptr = hv_fetch(data_hv, row_array[i], strlen(row_array[i]), 0);
2628        HV *restrict inner_hv = inner_hv_ptr ? (HV*)SvRV(*inner_hv_ptr) : NULL;
2629
2630
2
        for(size_t j=0; j<num_headers; j++) {
2631
2
            SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2632
1
            const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2633
1
            SV **restrict cell_ptr = inner_hv ? hv_fetch(inner_hv, col_name, strlen(col_name), 0) : NULL;
2634            if (cell_ptr && SvOK(*cell_ptr)) {
2635            if (SvROK(*cell_ptr)) {
2636              PerlIO_close(fh);
2637
2
              safefree(row_array); safefree(row_data);
2638
1
              if (headers_av) SvREFCNT_dec(headers_av);
2639
0
              if (rows_av) SvREFCNT_dec(rows_av);
2640
0
              croak("write_table: Cannot write nested reference types to table\n");
2641
0
            }
2642                row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2643
1
            } else {
2644
1
                row_data[d_idx++] = undef_val;
2645
1
            }
2646
1
        }
2647
1
        print_string_row(fh, row_data, d_idx, sep);
2648    }
2649
3
    safefree(row_array); safefree(row_data);
2650
2651    } else if (is_hoa) { // ----- Hash of Arrays -----
2652
0
        HV *restrict data_hv = (HV*)data_ref;
2653        size_t max_rows = 0;
2654        hv_iterinit(data_hv);
2655
12
        HE *restrict entry;
2656        while((entry = hv_iternext(data_hv))) {
2657            AV *restrict arr = (AV*)SvRV(hv_iterval(data_hv, entry));
2658
7
            size_t len = av_len(arr) + 1;
2659            if (len > max_rows) max_rows = len;
2660
7
        }
2661
2662        if (col_names_sv && SvOK(col_names_sv)) {
2663            AV *restrict c_av = (AV*)SvRV(col_names_sv);
2664
287
            for(size_t i=0; i<=av_len(c_av); i++) {
2665
280
                SV **restrict c = av_fetch(c_av, i, 0);
2666
280
                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2667            }
2668
280
        } else {
2669
280
            unsigned int num_cols = hv_iterinit(data_hv);
2670
1060
            const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2671
780
            for(unsigned int i=0; i<num_cols; i++) {
2672
280
                HE *restrict ce = hv_iternext(data_hv);
2673
500
                col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2674
60
            }
2675
60
            qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2676
60
            for(unsigned i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2677
60
            safefree(col_array);
2678
0
        }
2679        if (av_len(headers_av) < 0) croak("Could not get headers in write_table");
2680
440
        if (inc_rownames && contains_nondigit(row_names_sv)) {
2681
440
            rownames_col = SvPV_nolen(row_names_sv);
2682            AV *restrict filtered_headers = (AV*)sv_2mortal((SV*)newAV());
2683
2684
280
            for(size_t i=0; i<=av_len(headers_av); i++) {
2685
280
                SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2686
1060
                if (!h_ptr || !*h_ptr) continue;
2687
280
                SV *restrict h_sv = *h_ptr;
2688
280
                if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2689
280
                    av_push(filtered_headers, newSVsv(h_sv));
2690                }
2691
7
            }
2692
7
            SvREFCNT_dec(headers_av);
2693
0
            headers_av = filtered_headers;
2694
0
        }
2695        size_t num_headers = av_len(headers_av) + 1;
2696        const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2697
7
        size_t h_idx = 0;
2698
7
        if (inc_rownames) header_row[h_idx++] = "";
2699
7
        for(size_t i=0; i<num_headers; i++) {
2700
7
            SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2701
7
            header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2702
27
        }
2703        print_string_row(fh, header_row, h_idx, sep);
2704
7
        safefree(header_row);
2705
287
        const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2706
7
        for(size_t i=0; i<max_rows; i++) {
2707
287
            size_t d_idx = 0;
2708
280
            if (inc_rownames) {
2709
32
                if (rownames_col) {
2710
32
                    SV **restrict rn_arr_ptr = hv_fetch(data_hv, rownames_col, strlen(rownames_col), 0);
2711
32
                    if (rn_arr_ptr && SvROK(*rn_arr_ptr)) {
2712
32
                        AV *restrict rn_arr = (AV*)SvRV(*rn_arr_ptr);
2713
32
                        SV **restrict rn_val_ptr = av_fetch(rn_arr, i, 0);
2714
13
                        if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2715
0
                            if (SvROK(*rn_val_ptr)) {
2716
32
                                   PerlIO_close(fh);
2717                                   safefree(row_data);
2718
248
                                   if (headers_av) SvREFCNT_dec(headers_av);
2719
248
                                   croak("write_table: Cannot write nested reference types to table\n");
2720                             }
2721                             row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2722                         } else {
2723
21
                            row_data[d_idx++] = undef_val;
2724
805
                         }
2725
784
                    } else {
2726
288
                         row_data[d_idx++] = undef_val;
2727
288
                    }
2728
288
                } else {
2729
288
                    char buf[32];
2730
288
                    snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2731                    row_data[d_idx++] = savepv(buf);
2732
496
                }
2733
496
            }
2734            for(size_t j=0; j<num_headers; j++) {
2735                SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2736                const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2737
261
                SV **restrict arr_ptr = hv_fetch(data_hv, col_name, strlen(col_name), 0);
2738
805
                if (arr_ptr && SvROK(*arr_ptr)) {
2739
784
                    AV *restrict arr = (AV*)SvRV(*arr_ptr);
2740
3016
                    SV **restrict cell_ptr = av_fetch(arr, i, 0);
2741
2232
                    if (cell_ptr && SvOK(*cell_ptr)) {
2742
2232
                         if (SvROK(*cell_ptr)) {
2743
8688
                              PerlIO_close(fh);
2744                              safefree(row_data);
2745                              if (headers_av) SvREFCNT_dec(headers_av);
2746
21
                              croak("write_table: Cannot write nested reference types to table\n");
2747
82
                         }
2748
61
                         row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2749
61
                    } else {
2750
240
                         row_data[d_idx++] = undef_val;
2751
61
                    }
2752                } else {
2753                    row_data[d_idx++] = undef_val;
2754                }
2755
21
            }
2756
231
            print_string_row(fh, row_data, d_idx, sep);
2757
210
            if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2758
8050
        }
2759
7840
        safefree(row_data);
2760
30160
    } else if (is_aoh) {// ----- Array of Hashes -----
2761
7840
      AV *restrict data_av = (AV*)data_ref;
2762
7840
      size_t num_rows = av_len(data_av) + 1;
2763
2880
      if (col_names_sv && SvOK(col_names_sv)) {
2764           AV *restrict c_av = (AV*)SvRV(col_names_sv);
2765
2880
           for(size_t i=0; i<=av_len(c_av); i++) {
2766
2880
                SV **restrict c = av_fetch(c_av, i, 0);
2767                if(c && SvOK(*c)) av_push(headers_av, newSVsv(*c));
2768
2880
           }
2769
2880
      } else {
2770
1170
           HV *restrict col_map = newHV();
2771
0
           for(size_t i=0; i<num_rows; i++) {
2772
2880
                SV **restrict row_ptr = av_fetch(data_av, i, 0);
2773                if (row_ptr && SvROK(*row_ptr)) {
2774
4960
                    HV *restrict row_hv = (HV*)SvRV(*row_ptr);
2775
4960
                    hv_iterinit(row_hv);
2776
4960
                    HE *restrict entry;
2777                    while((entry = hv_iternext(row_hv))) {
2778                        hv_store_ent(col_map, hv_iterkeysv(entry), newSViv(1), 0);
2779                    }
2780
210
                }
2781
200
           }
2782           unsigned num_cols = hv_iterinit(col_map);
2783           const char **restrict col_array = safemalloc(num_cols * sizeof(char*));
2784
10
           for(unsigned int i=0; i<num_cols; i++) {
2785
40
                HE *restrict ce = hv_iternext(col_map);
2786                col_array[i] = SvPV_nolen(hv_iterkeysv(ce));
2787           }
2788
21
           qsort(col_array, num_cols, sizeof(char*), cmp_string_wt);
2789
7
           for(unsigned int i=0; i<num_cols; i++) av_push(headers_av, newSVpv(col_array[i], 0));
2790           safefree(col_array);
2791
14
           SvREFCNT_dec(col_map);
2792
55
      }
2793      if (inc_rownames && contains_nondigit(row_names_sv)) {
2794           rownames_col = SvPV_nolen(row_names_sv);
2795
85
           AV *restrict filtered_headers = newAV();
2796
287
           for(size_t i=0; i<=av_len(headers_av); i++) {
2797
280
                SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2798
280
                if (!h_ptr || !*h_ptr) continue;
2799
1060
                SV *restrict h_sv = *h_ptr;
2800
780
                if (strcmp(SvPV_nolen(h_sv), rownames_col) != 0) {
2801
3000
                    av_push(filtered_headers, newSVsv(h_sv));
2802                }
2803           }
2804
7
           SvREFCNT_dec(headers_av);
2805           headers_av = filtered_headers;
2806
7
      }
2807
287
      size_t num_headers = av_len(headers_av) + 1;
2808
280
      const char **restrict header_row = safemalloc((num_headers + 1) * sizeof(char*));
2809
32
      size_t h_idx = 0;
2810
13
      if (inc_rownames) header_row[h_idx++] = "";
2811
0
      for(size_t i=0; i<num_headers; i++) {
2812           SV**restrict h_ptr = av_fetch(headers_av, i, 0);
2813
248
           header_row[h_idx++] = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2814
248
      }
2815      print_string_row(fh, header_row, h_idx, sep);
2816      safefree(header_row);
2817      const char **restrict row_data = safemalloc((num_headers + 1) * sizeof(char*));
2818
7
      for(size_t i=0; i<num_rows; i++) {
2819
6
           size_t d_idx = 0;
2820
6
           SV **restrict row_ptr = av_fetch(data_av, i, 0);
2821
1
           HV *restrict row_hv = (row_ptr && SvROK(*row_ptr)) ? (HV*)SvRV(*row_ptr) : NULL;
2822
1
           if (inc_rownames) {
2823               if (rownames_col) {
2824                 SV **restrict rn_val_ptr = row_hv ? hv_fetch(row_hv, rownames_col, strlen(rownames_col), 0) : NULL;
2825
7
                 if (rn_val_ptr && SvOK(*rn_val_ptr)) {
2826
7
                       if (SvROK(*rn_val_ptr)) {
2827
7
                            PerlIO_close(fh);
2828
287
                                 safefree(row_data);
2829
280
                                 if (headers_av) SvREFCNT_dec(headers_av);
2830
280
                            croak("write_table: Cannot write nested reference types to table\n");
2831                       }
2832
32
                       row_data[d_idx++] = SvPV_nolen(*rn_val_ptr);
2833
32
                 } else {
2834
13
                       row_data[d_idx++] = undef_val;
2835
0
                 }
2836
32
               } else {
2837                 char buf[32];
2838
280
                 snprintf(buf, sizeof(buf), "%ld", (long)(i + 1));
2839
280
                 row_data[d_idx++] = savepv(buf);
2840
280
               }
2841           }
2842
2843           for(size_t j=0; j<num_headers; j++) {
2844
7
                SV**restrict h_ptr = av_fetch(headers_av, j, 0);
2845
27
                const char *restrict col_name = (h_ptr && SvOK(*h_ptr)) ? SvPV_nolen(*h_ptr) : "";
2846
20
                SV **restrict cell_ptr = row_hv ? hv_fetch(row_hv, col_name, strlen(col_name), 0) : NULL;
2847
20
                if (cell_ptr && SvOK(*cell_ptr)) {
2848                    if (SvROK(*cell_ptr)) {
2849
20
                        PerlIO_close(fh);
2850
20
                        safefree(row_data);
2851
0
                        if (headers_av) SvREFCNT_dec(headers_av);
2852
0
                        croak("write_table: Cannot write nested reference types to table\n");
2853
0
                    }
2854
0
                    row_data[d_idx++] = SvPV_nolen(*cell_ptr);
2855                } else {
2856
20
                    row_data[d_idx++] = undef_val;
2857
20
                }
2858
20
           }
2859           print_string_row(fh, row_data, d_idx, sep);
2860
20
           if (inc_rownames && !rownames_col) safefree((char*)row_data[0]);
2861
20
      }
2862
20
      safefree(row_data);
2863
20
    }
2864    if (headers_av) SvREFCNT_dec(headers_av);
2865
20
    if (rows_av) SvREFCNT_dec(rows_av);
2866    PerlIO_close(fh);
2867    XSRETURN_EMPTY;
2868
7
}
2869
2870
7
SV*
2871
7
_parse_csv_file(char* file, const char* sep_str, const char* comment_str, SV* callback = &PL_sv_undef)
2872
7
INIT:
2873
7
        PerlIO *restrict fp;
2874
7
        AV *restrict data = NULL;
2875
7
        AV *restrict current_row = newAV();
2876
7
        SV *restrict field = newSVpvs("");
2877
7
        bool in_quotes = 0, post_quote = 0;
2878
7
        size_t sep_len, comment_len;
2879
7
        SV *restrict line_sv;
2880
7
        bool use_cb = 0;
2881
7
CODE:
2882
7
        if (SvOK(callback) && SvROK(callback) && SvTYPE(SvRV(callback)) == SVt_PVCV) {
2883                use_cb = 1;
2884        } else {
2885
27
                data = newAV();
2886
7
        }
2887
27
        sep_len = sep_str ? strlen(sep_str) : 0;
2888
7
        comment_len = comment_str ? strlen(comment_str) : 0;
2889
2890
20
        fp = PerlIO_open(file, "r");
2891
20
        if (!fp) {
2892                croak("Could not open file '%s'", file);
2893
7
        }
2894        line_sv = newSV_type(SVt_PV);
2895
7
        // Read line by line using PerlIO
2896
7
        while (sv_gets(line_sv, fp, 0) != NULL) {
2897
7
                char *restrict line = SvPV_nolen(line_sv);
2898
7
                size_t len = SvCUR(line_sv);
2899                // chomp \r\n (Handles Windows invisible \r natively)
2900
7
                if (len > 0 && line[len-1] == '\n') {
2901                        len--;
2902                        if (len > 0 && line[len-1] == '\r') {
2903                                len--;
2904                        }
2905                }
2906                if (!in_quotes) {
2907                        // Skip completely empty lines (\h*[\r\n]+$ equivalent)
2908
7
                        bool is_empty = 1;
2909
0
                        for (size_t i = 0; i < len; i++) {
2910                                if (line[i] != ' ' && line[i] != '\t') { is_empty = 0; break; }
2911
7
                        }
2912                        if (is_empty) continue;
2913
2914
7
                        // Skip comments
2915
7
                        if (comment_len > 0 && len >= comment_len && strncmp(line, comment_str, comment_len) == 0) {
2916
7
                                continue;
2917
7
                        }
2918                }
2919                // --- CORE PARSING MACHINE ---
2920
35
                for (size_t i = 0; i < len; i++) {
2921
28
                        const char ch = line[i];
2922
28
                        if (ch == '\r') continue;
2923                        if (ch == '"') {
2924
28
                                if (in_quotes && (i + 1 < len) && line[i+1] == '"') {
2925
21
                                        sv_catpvn(field, "\"", 1);
2926
14
                                        i++; // Skip the escaped second quote
2927
14
                                } else if (in_quotes) {
2928
7
                                        in_quotes = 0;  // Close quotes
2929
0
                                        post_quote = 1;
2930                                } else if (!post_quote) {
2931                                        in_quotes = 1; // Open quotes (only when not in post-quote state)
2932                                }
2933                        } else if (!in_quotes && sep_len > 0 && (len - i) >= sep_len && strncmp(line + i, sep_str, sep_len) == 0) {
2934
7
                                av_push(current_row, newSVsv(field));
2935                                sv_setpvs(field, ""); // Reset for next field
2936
7
                                i += sep_len - 1;     // Advance past multi-char separators
2937
7
                                post_quote = 0;
2938
7
                        } else {
2939                                sv_catpvn(field, &ch, 1);
2940                        }
2941
7
                }
2942
7
                if (in_quotes) {
2943
0
                        // Line ended but quotes are still open! Append newline and fetch next
2944                        sv_catpvn(field, "\n", 1);
2945                } else {
2946
7
                        post_quote = 0; // Reset post-quote state at row boundary
2947
7
                        // Push the final field of the record
2948                        av_push(current_row, newSVsv(field));
2949
7
                        sv_setpvs(field, "");
2950
7
                        // If a callback is provided, invoke it in a streaming fashion
2951                        if (use_cb) {
2952
7
                                dSP;
2953
7
                                ENTER;
2954                                SAVETMPS;
2955
7
                                PUSHMARK(SP);
2956
46
                                XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2957
39
                                PUTBACK;
2958
39
                                call_sv(callback, G_DISCARD);
2959                                FREETMPS;
2960
39
                                LEAVE;
2961
39
                                SvREFCNT_dec(current_row); // Frees the row from C memory if Perl didn't keep it
2962                        } else {
2963                                av_push(data, newRV_noinc((SV*)current_row));
2964
39
                        }
2965
35
                        current_row = newAV();
2966
35
                }
2967
35
        }
2968        PerlIO_close(fp);
2969        SvREFCNT_dec(line_sv);
2970
2971
7
        if (in_quotes) {
2972
0
                av_push(current_row, newSVsv(field));
2973
0
                if (use_cb) {
2974
0
                        dSP;
2975                        ENTER;
2976                        SAVETMPS;
2977
7
                        PUSHMARK(SP);
2978                        XPUSHs(sv_2mortal(newRV_inc((SV*)current_row)));
2979
4
                        PUTBACK;
2980
24
                        call_sv(callback, G_DISCARD);
2981
20
                        FREETMPS;
2982
20
                        LEAVE;
2983
20
                        SvREFCNT_dec(current_row);
2984
20
                } else {
2985
20
                        av_push(data, newRV_noinc((SV*)current_row));
2986
20
                }
2987
20
                current_row = newAV();
2988        }
2989
4
        SvREFCNT_dec(field);
2990
4
        SvREFCNT_dec(current_row);
2991
4
        if (use_cb) {
2992                RETVAL = &PL_sv_undef; // Memory was fully handled by callback stream
2993        } else {
2994
4
                RETVAL = newRV_noinc((SV*)data);
2995
4
        }
2996
4
OUTPUT:
2997
4
        RETVAL
2998
2999
4
SV* cov(SV* x_sv, SV* y_sv, const char* method = "pearson")
3000        CODE:
3001        {
3002
4
                // 1. Validate inputs are Array References
3003
3
                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV) {
3004
2
                        croak("cov: first argument 'x' must be an ARRAY reference");
3005
10
                }
3006
28
                if (!SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV) {
3007
20
                        croak("cov: second argument 'y' must be an ARRAY reference");
3008
20
                }
3009
3010
20
                // 2. Validate method argument
3011
20
                if (strcmp(method, "pearson") != 0 &&
3012
20
                        strcmp(method, "spearman") != 0 &&
3013
20
                        strcmp(method, "kendall") != 0) {
3014
4
                        croak("cov: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')", method);
3015                }
3016
3017
2
                AV *restrict x_av = (AV*)SvRV(x_sv);
3018
2
                AV *restrict y_av = (AV*)SvRV(y_sv);
3019                size_t nx = av_len(x_av) + 1;
3020
2
                size_t ny = av_len(y_av) + 1;
3021
3022                if (nx != ny) {
3023                        croak("cov: incompatible dimensions (x has %lu, y has %lu)",
3024
2
                                   (unsigned long)nx, (unsigned long)ny);
3025
2
                }
3026
3027
0
                // 3. Extract Valid Pairwise Data
3028                // Allocate temporary C arrays for numeric processing
3029                double *restrict x_val = (double*)safemalloc(nx * sizeof(double));
3030
2
                double *restrict y_val = (double*)safemalloc(nx * sizeof(double));
3031                size_t n = 0;
3032
3033
2
                for (size_t i = 0; i < nx; i++) {
3034
2
                        SV **restrict x_tv = av_fetch(x_av, i, 0);
3035
2
                        SV **restrict y_tv = av_fetch(y_av, i, 0);
3036
3037                        // Extract numeric values, defaulting to NAN for missing/invalid data
3038
0
                        double xv = (x_tv && SvOK(*x_tv) && looks_like_number(*x_tv)) ? SvNV(*x_tv) : NAN;
3039
0
                        double yv = (y_tv && SvOK(*y_tv) && looks_like_number(*y_tv)) ? SvNV(*y_tv) : NAN;
3040
3041
0
                        // Pairwise complete observations (skips NAs seamlessly like R)
3042                        if (!isnan(xv) && !isnan(yv)) {
3043
0
                                 x_val[n] = xv;
3044
0
                                 y_val[n] = yv;
3045
0
                                 n++;
3046
0
                        }
3047                }
3048
3049                // 4. Handle edge cases where data is too sparse
3050                if (n < 2) {
3051
1
                        Safefree(x_val);        Safefree(y_val);
3052
1
                        RETVAL = newSVnv(NAN);
3053
1
                } else {
3054
1
                        double ans = 0.0;                       
3055
1
                        // 5. Algorithm routing
3056                        if (strcmp(method, "kendall") == 0) {
3057                                 // R's default cov(..., method="kendall") iterates the full n x n space
3058
1
                                 for (size_t i = 0; i < n; i++) {
3059
6
                                     for (size_t j = 0; j < n; j++) {
3060
5
                                         int sx = (x_val[i] > x_val[j]) - (x_val[i] < x_val[j]);
3061
5
                                         int sy = (y_val[i] > y_val[j]) - (y_val[i] < y_val[j]);
3062
5
                                         ans += (double)(sx * sy);
3063
5
                                     }
3064
5
                                 }
3065
5
                        } else {
3066
5
                                 double mean_x = 0.0, mean_y = 0.0, cov_sum = 0.0;
3067                                 if (strcmp(method, "spearman") == 0) {
3068
1
                                     // Spearman: Rank the data first, then run standard covariance
3069                                     double *restrict rx = (double*)safemalloc(n * sizeof(double));
3070                                     double *restrict ry = (double*)safemalloc(n * sizeof(double));
3071
1
                                     // Uses your existing rank_data() helper from LikeR.xs
3072
6
                                     rank_data(x_val, rx, n);
3073
5
                                     rank_data(y_val, ry, n);
3074
5
                                     for (size_t i = 0; i < n; i++) {
3075                                         double dx = rx[i] - mean_x;
3076                                         mean_x += dx / (i + 1);
3077                                         double dy = ry[i] - mean_y;
3078
1
                                         mean_y += dy / (i + 1);
3079
6
                                         cov_sum += dx * (ry[i] - mean_y);
3080
5
                                     }
3081
0
                                     Safefree(rx); Safefree(ry);
3082
0
                                 } else {
3083                                     // Pearson: Welford's Single-Pass Covariance Algorithm
3084                                     for (size_t i = 0; i < n; i++) {
3085
1
                                         double dx = x_val[i] - mean_x;
3086
1
                                         mean_x += dx / (i + 1);
3087                                         double dy = y_val[i] - mean_y;
3088
0
                                         mean_y += dy / (i + 1);
3089                                         cov_sum += dx * (y_val[i] - mean_y);
3090                                     }
3091
1
                                 }
3092
3093
1
                                 // Unbiased Sample Covariance (N - 1) for Pearson & Spearman
3094                                 ans = cov_sum / (n - 1);
3095
0
                        }
3096
0
                        Safefree(x_val); Safefree(y_val);
3097
0
                        RETVAL = newSVnv(ans);
3098
0
                }
3099
0
        }
3100        OUTPUT:
3101
1
          RETVAL
3102
3103
0
SV* glm(...)
3104
0
CODE:
3105{
3106
7
        const char *restrict formula  = NULL;
3107
7
        SV *restrict data_sv = NULL;
3108
7
        const char *restrict family_str = "gaussian";
3109
7
        char f_cpy[512];
3110
7
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
3111
3112
7
        // Dynamic Term Arrays
3113
7
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
3114
4
        bool *restrict is_dummy = NULL;
3115
4
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
3116
4
        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
3117
4
        size_t n = 0, valid_n = 0, i;
3118
4
        bool has_intercept = TRUE, converged = FALSE, boundary = FALSE;
3119        unsigned int iter = 0, max_iter = 25, final_rank = 0, df_res = 0;
3120        double deviance_old = 0.0, deviance_new = 0.0, null_dev = 0.0, aic = 0.0;
3121
7
        double dispersion = 0.0, epsilon = 1e-8;
3122
3123        char **restrict row_names = NULL;
3124        char **restrict valid_row_names = NULL;
3125        HV **restrict row_hashes = NULL;
3126        HV *restrict data_hoa = NULL;
3127        SV *restrict ref = NULL;
3128
3129        double *restrict X = NULL, *restrict Y = NULL, *restrict mu = NULL, *restrict eta = NULL;
3130        double *restrict W = NULL, *restrict Z = NULL, *restrict beta = NULL, *restrict beta_old = NULL;
3131
2
        bool *restrict aliased = NULL;
3132
2
        double *restrict XtWX = NULL, *restrict XtWZ = NULL;
3133
3134
2
        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
3135
0
        AV *restrict terms_av;
3136        HE *restrict entry;
3137
3138
2
        if (items % 2 != 0) croak("Usage: glm(formula => 'am ~ wt + hp', data => \\%mtcars)");
3139
3140        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
3141
2
          const char *restrict key = SvPV_nolen(ST(i_arg));
3142          SV *restrict val = ST(i_arg + 1);
3143          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
3144
26
          else if (strEQ(key, "data"))    data_sv = val;
3145
24
          else if (strEQ(key, "family"))  family_str = SvPV_nolen(val);
3146
24
          else croak("glm: unknown argument '%s'", key);
3147
24
        }        
3148
24
        if (!formula) croak("glm: formula is required");
3149
24
        if (!data_sv || !SvROK(data_sv)) croak("glm: data is required and must be a reference");
3150
3151
24
        bool is_binomial = (strcmp(family_str, "binomial") == 0);
3152        bool is_gaussian = (strcmp(family_str, "gaussian") == 0);
3153        if (!is_binomial && !is_gaussian) croak("glm: unsupported family '%s'", family_str);
3154
3155        // --- Formula Parsing & Expansion ---
3156
2
        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
3157
0
        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
3158
0
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
3159
3160        src = (char*)formula; dst = f_cpy;
3161
2
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
3162        *dst = '\0';
3163
3164
24
        tilde = strchr(f_cpy, '~');
3165        if (!tilde) croak("glm: invalid formula, missing '~'");
3166
2
        *tilde = '\0';
3167
0
        lhs = f_cpy; rhs = tilde + 1;
3168
3169        if (strstr(rhs, "-1")) has_intercept = FALSE;
3170
2
        if (has_intercept) terms[num_terms++] = savepv("Intercept");
3171
3172        chunk = strtok(rhs, "+");
3173
2
        while (chunk != NULL) {
3174
0
          if (num_terms >= term_cap - 3) {
3175
0
                   term_cap *= 2;
3176
0
                   Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
3177
0
          }
3178          if (strcmp(chunk, "1") == 0 || strcmp(chunk, "-1") == 0) {
3179
0
                   chunk = strtok(NULL, "+");
3180                   continue;
3181          }
3182
2
          char *restrict star = strchr(chunk, '*');
3183
2
          if (star) {
3184
2
                   *star = '\0';
3185
26
                   char *restrict left = chunk; char *restrict right = star + 1;
3186
24
                   char *restrict c_l = strchr(left, '^'); if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
3187
24
                   char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
3188
3189
2
                   terms[num_terms++] = savepv(left);
3190
2
                   terms[num_terms++] = savepv(right);
3191
2
                   size_t inter_len = strlen(left) + strlen(right) + 2;
3192
2
                   terms[num_terms] = (char*)safemalloc(inter_len);
3193
3
                   snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
3194
1
          } else {
3195
4
                   char *restrict c_chunk = strchr(chunk, '^');
3196
3
                   if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
3197                   terms[num_terms++] = savepv(chunk);
3198          }
3199
1
          chunk = strtok(NULL, "+");
3200
1
        }
3201
3202
1
        for (i = 0; i < num_terms; i++) {
3203
16
          bool found = FALSE;
3204
15
          for (size_t j = 0; j < num_uniq; j++) {
3205                   if (strcmp(terms[i], uniq_terms[j]) == 0) { found = TRUE; break; }
3206          }
3207
26
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
3208
24
        }
3209        p = num_uniq;
3210
3211        // --- Data Extraction ---
3212        ref = SvRV(data_sv);
3213        if (SvTYPE(ref) == SVt_PVHV) {
3214                HV*restrict hv = (HV*)ref;
3215
2
                if (hv_iterinit(hv) == 0) croak("glm: Data hash is empty");
3216                entry = hv_iternext(hv);
3217
2
                if (entry) {
3218                        SV*restrict val = hv_iterval(hv, entry);
3219                        if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
3220                                 data_hoa = hv;
3221
1
                                 n = av_len((AV*)SvRV(val)) + 1;
3222
1
                                 Newx(row_names, n, char*);
3223
1
                                 for(i = 0; i < n; i++) {
3224
0
                                     char buf[32]; snprintf(buf, sizeof(buf), "%lu", i+1);
3225                                     row_names[i] = savepv(buf);
3226                                 }
3227
1
                        } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
3228
1
                                 n = hv_iterinit(hv);
3229
1
                                 Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
3230
1
                                 i = 0;
3231                                 while ((entry = hv_iternext(hv))) {
3232                                     I32 len;
3233
1
                                     row_names[i] = savepv(hv_iterkey(entry, &len));
3234                                     row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
3235                                     i++;
3236                                 }
3237
1
                        } else croak("glm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
3238                }
3239
1
        } else if (SvTYPE(ref) == SVt_PVAV) {
3240
1
          AV*restrict av = (AV*)ref;
3241
1
          n = av_len(av) + 1;
3242
1
          Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
3243
1
          for (i = 0; i < n; i++) {
3244                   SV**restrict val = av_fetch(av, i, 0);
3245                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
3246
2
                       row_hashes[i] = (HV*)SvRV(*val);
3247
2
                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", i + 1);
3248                       row_names[i] = savepv(buf);
3249
2
                   } else {
3250                       for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
3251
2
                       Safefree(row_names); Safefree(row_hashes);
3252
2
                       croak("glm: Array values must be HashRefs (AoH)");
3253
2
                   }
3254
2
          }
3255
2
        } else croak("glm: Data must be an Array or Hash reference");
3256
3257
2
        // --- Categorical Expansion ---
3258
2
        for (size_t j = 0; j < p; j++) {
3259          if (p_exp + 32 >= exp_cap) {
3260                   exp_cap *= 2;
3261                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
3262                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
3263
12
          }
3264
12
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
3265
12
                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = FALSE; p_exp++; continue;
3266          }
3267
10038
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
3268
10028
                   char **restrict levels = NULL; size_t num_levels = 0, levels_cap = 8;
3269
10031
                   Newx(levels, levels_cap, char*);
3270
4
                   for (i = 0; i < n; i++) {
3271
4
                       char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
3272
216
                       if (str_val) {
3273
213
                           bool found = FALSE;
3274
213
                           for (size_t l = 0; l < num_levels; l++) {
3275
212
                               if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; }
3276
212
                           }
3277
3
                           if (!found) {
3278
3
                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
3279                               levels[num_levels++] = savepv(str_val);
3280
212
                           }
3281                           Safefree(str_val);
3282
1
                       }
3283                   }
3284                   if (num_levels > 0) {
3285
10024
                       for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
3286
10023
                           for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
3287
10023
                               if (strcmp(levels[l1], levels[l2]) > 0) {
3288
24
                                   char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
3289
24
                               }
3290                           }
3291
10023
                       }
3292                       for (size_t l = 1; l < num_levels; l++) {
3293
1
                           if (p_exp >= exp_cap) {
3294                               exp_cap *= 2;
3295                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
3296
10
                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
3297
9
                           }
3298                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
3299                           exp_terms[p_exp] = (char*)safemalloc(t_len);
3300                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
3301                           is_dummy[p_exp] = TRUE; dummy_base[p_exp] = savepv(uniq_terms[j]); dummy_level[p_exp] = savepv(levels[l]);
3302                           p_exp++;
3303                       }
3304
13
                       for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
3305
13
                       Safefree(levels);
3306
13
                   } else {
3307                       Safefree(levels); exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
3308
10039
                   }
3309
10028
          } else {
3310
10032
                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
3311
5
          }
3312
5
        }
3313
317
        p = p_exp;
3314
3315
313
        Newx(X, n * p, double); Newx(Y, n, double);
3316
312
        Newx(valid_row_names, n, char*);
3317
3318
15
        // --- Listwise Deletion ---
3319
15
        for (size_t i = 0; i < n; i++) {
3320                double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
3321
312
                if (isnan(y_val)) { Safefree(row_names[i]); continue; }
3322
3323
1
                bool row_ok = TRUE;
3324                double *restrict row_x = (double*)safemalloc(p * sizeof(double));
3325                for (size_t j = 0; j < p; j++) {
3326
10023
                        if (strcmp(exp_terms[j], "Intercept") == 0) {
3327
10022
                                 row_x[j] = 1.0;
3328
10022
                        } else if (is_dummy[j]) {
3329
27
                                 char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
3330
27
                                 if (str_val) {
3331                                     row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
3332
10022
                                     Safefree(str_val);
3333                                 } else { row_ok = FALSE; break; }
3334
1
                        } else {
3335                                 row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
3336                                 if (isnan(row_x[j])) { row_ok = FALSE; break; }
3337
11
                        }
3338
10
                }
3339                if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
3340                Y[valid_n] = y_val;
3341                for (size_t j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
3342                valid_row_names[valid_n] = row_names[i];
3343                valid_n++;
3344                Safefree(row_x);
3345
4
        }
3346
4
        Safefree(row_names);
3347        if (valid_n <= p) {
3348          Safefree(X); Safefree(Y); Safefree(valid_row_names); if (row_hashes) Safefree(row_hashes);
3349
4
          croak("glm: 0 degrees of freedom (too many NAs or parameters > observations)");
3350        }
3351
4
        // --- R glm.fit IRLS Implementation ---
3352        mu = (double*)safemalloc(valid_n * sizeof(double)); eta = (double*)safemalloc(valid_n * sizeof(double));
3353
4
        W = (double*)safemalloc(valid_n * sizeof(double)); Z = (double*)safemalloc(valid_n * sizeof(double));
3354
0
        beta = (double*)safemalloc(p * sizeof(double)); beta_old = (double*)safemalloc(p * sizeof(double));
3355        aliased = (bool*)safemalloc(p * sizeof(bool));
3356        XtWX = (double*)safemalloc(p * p * sizeof(double)); XtWZ = (double*)safemalloc(p * sizeof(double));
3357
14
        for (i = 0; i < p; i++) { beta[i] = 0.0; beta_old[i] = 0.0; }
3358        // Initialize (mustart / etastart equivalent)
3359
10
        double sum_y = 0.0;
3360
6
        for (i = 0; i < valid_n; i++) sum_y += Y[i];
3361
6
        double mean_y = sum_y / valid_n;
3362
2
        for (i = 0; i < valid_n; i++) {
3363
2
          if (is_binomial) {
3364
2
                   if (Y[i] < 0.0 || Y[i] > 1.0) croak("glm: binomial family requires response between 0 and 1");
3365
2
                   mu[i] = (Y[i] + 0.5) / 2.0;
3366
4
                   eta[i] = log(mu[i] / (1.0 - mu[i]));
3367
2
                   double dev = 0.0;
3368
2
                   if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
3369
2
                   else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
3370
2
                   else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
3371
2
                   deviance_old += dev;
3372
2
          } else {
3373
2
                   mu[i] = mean_y; // R gaussian init
3374
2
                   eta[i] = mu[i];
3375
2
          }
3376        }
3377        // IRLS Loop
3378        for (iter = 1; iter <= max_iter; iter++) {
3379                for (i = 0; i < valid_n; i++) {
3380
4
                        if (is_binomial) {
3381
2
                                 double varmu = mu[i] * (1.0 - mu[i]);
3382
2
                                 double mu_eta = varmu; // Link derivative for logit
3383
2
                                 if (varmu < 1e-10) varmu = 1e-10;
3384
1
                                 Z[i] = eta[i] + (Y[i] - mu[i]) / mu_eta;
3385
1
                                 W[i] = (mu_eta * mu_eta) / varmu;
3386
1
                        } else {
3387
1
                                 W[i] = 1.0;
3388
1
                                 Z[i] = Y[i];
3389                        }
3390
0
                }
3391                // Formulate XtWX and XtWZ
3392
4
                for (i = 0; i < p; i++) { XtWZ[i] = 0.0; for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
3393                for (size_t k = 0; k < valid_n; k++) {
3394
4
                        double w = W[k], z = Z[k];
3395
0
                        for (i = 0; i < p; i++) {
3396                                 XtWZ[i] += X[k * p + i] * w * z;
3397                                 double xw = X[k * p + i] * w;
3398
4
                                 for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
3399
4
                        }
3400
4
                }
3401
4
                final_rank = sweep_matrix_ols(XtWX, p, aliased);
3402                for (i = 0; i < p; i++) {
3403
4
                        if (aliased[i]) { beta[i] = NAN; } else {
3404
20020
                                 double sum = 0.0;
3405                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) sum += XtWX[i * p + j] * XtWZ[j];
3406
20016
                                 beta[i] = sum;
3407
0
                        }
3408                }
3409
20016
                // Calculate updated ETA, MU, and Deviance (with Step-Halving)
3410                boundary = FALSE;
3411
20016
                for (unsigned short int half = 0; half < 10; half++) {
3412                        deviance_new = 0.0;
3413
4
                        for (i = 0; i < valid_n; i++) {
3414                                 double linear_pred = 0.0;
3415                                 for (size_t j = 0; j < p; j++) if (!aliased[j]) linear_pred += X[i * p + j] * beta[j];
3416                                 eta[i] = linear_pred;
3417                                 if (is_binomial) {
3418                                     mu[i] = 1.0 / (1.0 + exp(-eta[i]));
3419                                     // Boundary enforcement
3420                                     if (mu[i] < 10 * DBL_EPSILON) mu[i] = 10 * DBL_EPSILON;
3421                                     if (mu[i] > 1.0 - 10 * DBL_EPSILON) mu[i] = 1.0 - 10 * DBL_EPSILON;
3422
3423
12
                                     double dev = 0.0;
3424
1
                                     if (Y[i] == 0.0)      dev = -2.0 * log(1.0 - mu[i]);
3425                                     else if (Y[i] == 1.0) dev = -2.0 * log(mu[i]);
3426
11
                                     else dev = 2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i])));
3427
11
                                     deviance_new += dev;
3428                                 } else {
3429
11
                                     mu[i] = eta[i];
3430                                     double res = Y[i] - mu[i];
3431
42
                                     deviance_new += res * res;
3432
31
                                 }
3433
31
                        }
3434                        // Step halving divergence check
3435
31
                        if (!is_binomial || deviance_new <= deviance_old + 1e-7 || !isfinite(deviance_new)) {
3436
20
                                 continue;
3437
10
                        }
3438
3439                        boundary = TRUE;
3440                        for (size_t j = 0; j < p; j++) beta[j] = (beta[j] + beta_old[j]) / 2.0;
3441                }
3442
11
                // Convergence Check
3443
9
                if (fabs(deviance_new - deviance_old) / (0.1 + fabs(deviance_new)) < epsilon) {
3444                        converged = TRUE; break;
3445
7
                }
3446
7
                deviance_old = deviance_new;
3447
7
                for (size_t j = 0; j < p; j++) beta_old[j] = beta[j];
3448
20506
        }
3449
20499
        // Final accurate calculation of W for standard errors
3450        for (i = 0; i < p; i++) { for (size_t j = 0; j < p; j++) XtWX[i * p + j] = 0.0; }
3451        for (size_t k = 0; k < valid_n; k++) {
3452          double w = is_binomial ? (mu[k] * (1.0 - mu[k])) : 1.0;
3453
7
          if (w < 1e-10) w = 1e-10;
3454          for (i = 0; i < p; i++) {
3455                   double xw = X[k * p + i] * w;
3456                   for (size_t j = 0; j < p; j++) XtWX[i * p + j] += xw * X[k * p + j];
3457          }
3458        }
3459        final_rank = sweep_matrix_ols(XtWX, p, aliased);
3460        // --- Null Deviance Calculation ---
3461        double wtdmu = mean_y; // Since weights are 1.0 initially
3462        for (i = 0; i < valid_n; i++) {
3463
9
          if (is_binomial) {
3464
2
                   if (Y[i] == 0.0)      null_dev += -2.0 * log(1.0 - wtdmu);
3465                   else if (Y[i] == 1.0) null_dev += -2.0 * log(wtdmu);
3466
7
                   else null_dev += 2.0 * (Y[i] * log(Y[i] / wtdmu) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - wtdmu)));
3467
7
          } else {
3468
7
                   double diff = Y[i] - wtdmu;
3469                   null_dev += diff * diff;
3470          }
3471        }
3472
6
        // --- AIC Calculation ---
3473
6
        if (is_gaussian) {
3474
6
          double n_f = (double)valid_n;
3475          aic = n_f * (log(2.0 * M_PI) + 1.0 + log(deviance_new / n_f)) + 2.0 * (final_rank + 1.0);
3476
2026
        } else if (is_binomial) {
3477
2021
          aic = deviance_new + 2.0 * final_rank;
3478
2021
        }
3479
2021
        // --- Return Structures ---
3480
2020
        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
3481
2020
        df_res = valid_n - final_rank;
3482
2020
        dispersion = is_binomial ? 1.0 : ((df_res > 0) ? (deviance_new / df_res) : NAN);
3483        for (size_t i = 0; i < valid_n; i++) {
3484                double res = Y[i] - mu[i];
3485
5
                if (is_binomial) {
3486
0
                        // Deviance residuals for binomial
3487
0
                        double d_res = 0.0;
3488                        if (Y[i] == 0.0)      d_res = sqrt(-2.0 * log(1.0 - mu[i]));
3489                        else if (Y[i] == 1.0) d_res = sqrt(-2.0 * log(mu[i]));
3490
5
                        else d_res = sqrt(2.0 * (Y[i] * log(Y[i] / mu[i]) + (1.0 - Y[i]) * log((1.0 - Y[i]) / (1.0 - mu[i]))));
3491                        res = (Y[i] > mu[i]) ? d_res : -d_res;
3492
5
                }
3493                hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(mu[i]), 0);
3494
0
                hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res), 0);
3495
5
                Safefree(valid_row_names[i]);
3496        }
3497
5
        Safefree(valid_row_names);
3498
3499
5
        summary_hv = newHV(); terms_av = newAV();
3500
5
        for (size_t j = 0; j < p; j++) {
3501
5
                hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
3502                av_push(terms_av, newSVpv(exp_terms[j], 0));
3503
3504                HV *restrict row_hv = newHV();
3505
5
                if (aliased[j]) {
3506
0
                        hv_store(row_hv, "Estimate",   8, newSVpv("NaN", 0), 0);
3507                        hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
3508                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVpv("NaN", 0), 0);
3509
5
                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVpv("NaN", 0), 0);
3510                } else {
3511                        double se = sqrt(dispersion * XtWX[j * p + j]);
3512                        double val_stat = beta[j] / se;
3513
5
                        double p_val = is_binomial ? 2.0 * (1.0 - approx_pnorm(fabs(val_stat))) : get_t_pvalue(val_stat, df_res, "two.sided");
3514
3515
5
                        hv_store(row_hv, "Estimate",   8, newSVnv(beta[j]), 0);
3516
5
                        hv_store(row_hv, "Std. Error", 10, newSVnv(se), 0);
3517                        hv_store(row_hv, is_binomial ? "z value" : "t value", 7, newSVnv(val_stat), 0);
3518                        hv_store(row_hv, is_binomial ? "Pr(>|z|)" : "Pr(>|t|)", 8, newSVnv(p_val), 0);
3519
5
                }
3520
28
                hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
3521
23
        }
3522
3523        hv_store(res_hv, "aic",            3, newSVnv(aic), 0);
3524        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv), 0);
3525
5
        hv_store(res_hv, "converged",      9, newSVuv(converged ? 1 : 0), 0);
3526        hv_store(res_hv, "boundary",       8, newSVuv(boundary ? 1 : 0), 0);
3527        hv_store(res_hv, "deviance",       8, newSVnv(deviance_new), 0);
3528
5
        hv_store(res_hv, "deviance.resid", 14, newRV_noinc((SV*)resid_hv), 0);
3529
5
        hv_store(res_hv, "df.null",        7, newSVuv(valid_n - has_intercept), 0);
3530
5
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res), 0);
3531
5
        hv_store(res_hv, "family",         6, newSVpv(family_str, 0), 0);
3532
5
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
3533
28
        hv_store(res_hv, "iter",           4, newSVuv(iter > max_iter ? max_iter : iter), 0);
3534
23
        hv_store(res_hv, "null.deviance", 13, newSVnv(null_dev), 0);
3535
23
        hv_store(res_hv, "rank",           4, newSVuv(final_rank), 0);
3536
18
        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv), 0);
3537
18
        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av), 0);
3538
3539        // --- Cleanup ---
3540        for (i = 0; i < num_terms; i++) Safefree(terms[i]);
3541
5
        Safefree(terms);
3542
5
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]);
3543
5
        Safefree(uniq_terms);
3544
5
        for (size_t j = 0; j < p_exp; j++) {
3545                Safefree(exp_terms[j]);
3546                if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
3547
5
        }
3548
5
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
3549
3550
5
        Safefree(mu); Safefree(eta); Safefree(Z); Safefree(W);
3551        Safefree(beta); Safefree(beta_old); Safefree(aliased);
3552        Safefree(XtWX); Safefree(XtWZ); Safefree(X); Safefree(Y);
3553        if (row_hashes) Safefree(row_hashes);
3554
3555        RETVAL = newRV_noinc((SV*)res_hv);
3556}
3557OUTPUT:
3558
4
    RETVAL
3559
3560
4
SV* cor_test(...)
3561CODE:
3562{
3563
4
        if (items < 2 || items % 2 != 0)
3564
3
                croak("Usage: cor_test(\\@x, \\@y, method => 'pearson', ...)");
3565
3566        SV *restrict x_ref = ST(0), *restrict y_ref = ST(1);
3567
3568        const char *restrict alternative = "two.sided";
3569
4
        const char *restrict method = "pearson";
3570
0
        SV *restrict exact_sv = NULL;
3571        double conf_level = 0.95;
3572
9
        bool continuity = 0;
3573
3574
5
        /* Parse named arguments from the flat stack starting at index 2 */
3575        for (unsigned short int i = 2; i < items; i += 2) {
3576
5
          const char *restrict key = SvPV_nolen(ST(i));
3577
4
          SV *restrict val = ST(i + 1);
3578
3579          if      (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
3580
4
          else if (strEQ(key, "method"))      method = SvPV_nolen(val);
3581
0
          else if (strEQ(key, "exact"))       exact_sv = val;
3582
4
          else if (strEQ(key, "conf.level") || strEQ(key, "conf_level")) conf_level = SvNV(val);
3583
4
          else if (strEQ(key, "continuity"))  continuity = SvTRUE(val);
3584
4
          else croak("cor_test: unknown argument '%s'", key);
3585        }
3586
3587        AV *restrict x_av, *restrict y_av;
3588
4
        double *restrict x, *restrict y;
3589
4
        double estimate = 0, p_value = 0, statistic = 0, df = 0, ci_lower = 0, ci_upper = 0;
3590
3591
204
        bool is_pearson  = (strcmp(method, "pearson") == 0);
3592
204
        bool is_kendall  = (strcmp(method, "kendall") == 0);
3593
204
        bool is_spearman = (strcmp(method, "spearman") == 0);
3594        HV *restrict rhv;
3595
3596
4
        if (!SvOK(x_ref) || !SvROK(x_ref) || SvTYPE(SvRV(x_ref)) != SVt_PVAV ||
3597
0
            !SvOK(y_ref) || !SvROK(y_ref) || SvTYPE(SvRV(y_ref)) != SVt_PVAV) {
3598
0
          croak("cor_test: x and y must be array references");
3599        }
3600
3601
4
        x_av = (AV*)SvRV(x_ref);
3602        y_av = (AV*)SvRV(y_ref);
3603
3604
4
        size_t n_raw = av_len(x_av) + 1;
3605        if (n_raw != av_len(y_av) + 1) croak("incompatible dimensions");
3606
3607
8
        x = safemalloc(n_raw * sizeof(double));
3608
4
        y = safemalloc(n_raw * sizeof(double));
3609
3610
4
        size_t n = 0; /* Final count of pairwise complete observations */
3611
13
        for (size_t i = 0; i < n_raw; i++) {
3612
9
          SV **restrict x_val = av_fetch(x_av, i, 0);
3613
9
          SV **restrict y_val = av_fetch(y_av, i, 0);
3614
3615
0
          double xv = (x_val && SvOK(*x_val) && looks_like_number(*x_val)) ? SvNV(*x_val) : NAN;
3616
0
          double yv = (y_val && SvOK(*y_val) && looks_like_number(*y_val)) ? SvNV(*y_val) : NAN;
3617
3618          /* Pairwise complete observations (skips NAs seamlessly like R) */
3619          if (!isnan(xv) && !isnan(yv)) {
3620
0
              x[n] = xv;
3621
0
              y[n] = yv;
3622              n++;
3623          }
3624        }
3625
3626        if (n < 3) {
3627
13
          Safefree(x);
3628
9
          Safefree(y);
3629          croak("not enough finite observations");
3630
9
        }
3631
3632
8
        if (is_pearson) {
3633
1
          // Welford's Method for Pearson Correlation
3634
7
          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
3635
1
          for (size_t i = 0; i < n; i++) {
3636                   double dx = x[i] - mean_x;
3637                   mean_x += dx / (i + 1);
3638
6
                   double dy = y[i] - mean_y;
3639
6
                   mean_y += dy / (i + 1);
3640
6
                   M2_x += dx * (x[i] - mean_x);
3641
6
                   M2_y += dy * (y[i] - mean_y);
3642                   cov  += dx * (y[i] - mean_y);
3643          }
3644          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
3645          df = n - 2;
3646
9
          statistic = estimate * sqrt(df / (1.0 - estimate * estimate));
3647
3648
9
          // Confidence interval using Fisher's Z transform
3649
9
          double z = 0.5 * log((1.0 + estimate) / (1.0 - estimate));
3650          double se = 1.0 / sqrt(n - 3);
3651
0
          double alpha = 1.0 - conf_level;
3652          double q = inverse_normal_cdf(1.0 - alpha/2.0);
3653          ci_lower = tanh(z - q * se);
3654
9
          ci_upper = tanh(z + q * se);
3655
3656          // HIGH-PRECISION P-VALUE USING INCOMPLETE BETA
3657
4
          p_value = get_t_pvalue(statistic, df, alternative);
3658
4
        } else if (is_kendall) {
3659          int c = 0, d = 0, tie_x = 0, tie_y = 0;
3660
4
          for (size_t i = 0; i < n - 1; i++) {
3661                   for (size_t j = i + 1; j < n; j++) {
3662                       double sign_x = (x[i] - x[j] > 0) - (x[i] - x[j] < 0);
3663                       double sign_y = (y[i] - y[j] > 0) - (y[i] - y[j] < 0);
3664
3665                       if (sign_x == 0 && sign_y == 0) { /* Joint tie, ignore */ }
3666                       else if (sign_x == 0) tie_x++;
3667                       else if (sign_y == 0) tie_y++;
3668                       else if (sign_x * sign_y > 0) c++;
3669
41
                       else d++;
3670
41
                   }
3671          }
3672
93
          double denom = sqrt((double)(c + d + tie_x) * (double)(c + d + tie_y));
3673
54
          estimate = (denom == 0.0) ? (0.0/0.0) : (double)(c - d) / denom;
3674
3675
38
          bool has_ties = (tie_x > 0 || tie_y > 0);
3676
38
          bool do_exact;
3677
3678
20270
          /* Mirror R: exact defaults to TRUE if N < 50 and NO ties */
3679
20270
          if (!exact_sv || !SvOK(exact_sv)) {
3680
20269
                   do_exact = (n < 50) && !has_ties;
3681
20269
          } else {
3682                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3683
1
          }
3684          // If forced exact but ties exist, R overrides and falls back to approximation anyway
3685          if (do_exact && has_ties) do_exact = 0;
3686
3687
15
          if (do_exact) {
3688
15
                   double S_stat = c - d;
3689                   statistic = c;
3690
1
                   p_value = kendall_exact_pvalue(n, S_stat, alternative);
3691          } else {
3692                   // Normal approximation for large N or ties
3693
39
                   double var_S = n * (n - 1) * (2.0 * n + 5.0) / 18.0;
3694
38
                   double S = c - d;
3695                   if (continuity) S -= (S > 0 ? 1 : -1);
3696                   statistic = S / sqrt(var_S);
3697
3698                   if (strcmp(alternative, "two.sided") == 0) {
3699                       p_value = 2.0 * (1.0 - approx_pnorm(fabs(statistic)));
3700                   } else if (strcmp(alternative, "less") == 0) {
3701
5
                       p_value = approx_pnorm(statistic);
3702
5
                   } else {
3703                       p_value = 1.0 - approx_pnorm(statistic);
3704
19
                   }
3705
16
          }
3706
17
        } else if (is_spearman) {
3707
2
          double *restrict rank_x = safemalloc(n * sizeof(double));
3708
2
          double *restrict rank_y = safemalloc(n * sizeof(double));
3709
11
          compute_ranks(x, rank_x, n);
3710
10
          compute_ranks(y, rank_y, n);
3711
3712
9
          // Spearman rho = Pearson r of the ranks (Welford's Method)
3713
9
          double mean_x = 0.0, mean_y = 0.0, M2_x = 0.0, M2_y = 0.0, cov = 0.0;
3714          for (size_t i = 0; i < n; i++) {
3715
1
                   double dx = rank_x[i] - mean_x;
3716                   mean_x += dx / (i + 1);
3717                   double dy = rank_y[i] - mean_y;
3718
14
                   mean_y += dy / (i + 1);
3719
13
                   M2_x += dx * (rank_x[i] - mean_x);
3720
13
                   M2_y += dy * (rank_y[i] - mean_y);
3721                   cov  += dx * (rank_y[i] - mean_y);
3722
1
          }
3723          estimate = (M2_x > 0.0 && M2_y > 0.0) ? cov / sqrt(M2_x * M2_y) : 0.0;
3724
3725
3
          // S = sum of squared rank differences (R's reported statistic)
3726
3
          double S_stat = 0.0;
3727          for (size_t i = 0; i < n; i++) {
3728                   double diff = rank_x[i] - rank_y[i];
3729                   S_stat += diff * diff;
3730          }
3731
3732          // Ties produce fractional (averaged) ranks — detect them
3733
7
          bool has_ties = 0, do_exact;
3734
7
          for (size_t i = 0; i < n; i++) {
3735                   if (rank_x[i] != floor(rank_x[i]) || rank_y[i] != floor(rank_y[i])) {
3736                       has_ties = 1;
3737
26
                       break;
3738
21
                   }
3739
22
          }
3740
2
          if (!exact_sv || !SvOK(exact_sv)) {
3741
2
                   do_exact = (n < 10) && !has_ties;
3742
10002
          } else {
3743
10001
                   do_exact = SvTRUE(exact_sv) ? 1 : 0;
3744
10001
          }
3745
3746
10000
          if (do_exact) {
3747
10000
                   statistic = S_stat;
3748
10000
                   p_value   = spearman_exact_pvalue(S_stat, n, alternative);
3749
10000
          } else {
3750                   double r = estimate;
3751
1
                   if (continuity)
3752                       r *= (1.0 - 1.0 / (2.0 * (n - 1)));
3753                   statistic = r * sqrt((n - 2.0) / (1.0 - r * r));
3754
19
                   p_value = get_t_pvalue(statistic, (double)(n - 2), alternative);
3755
18
          }
3756
18
          Safefree(rank_x); Safefree(rank_y);
3757
18
        } else {
3758
18
          Safefree(x); Safefree(y);
3759
18
          croak("Unknown method");
3760        }
3761
1
        Safefree(x); Safefree(y);
3762        rhv = newHV();
3763        hv_stores(rhv, "estimate", newSVnv(estimate));
3764
5
        hv_stores(rhv, "p.value", newSVnv(p_value));
3765
4
        hv_stores(rhv, "statistic", newSVnv(statistic));
3766        hv_stores(rhv, "method", newSVpv(method, 0));
3767        hv_stores(rhv, "alternative", newSVpv(alternative, 0));
3768        if (is_pearson) {
3769          hv_stores(rhv, "parameter", newSVnv(df));
3770          AV *restrict ci_av = newAV();
3771          av_push(ci_av, newSVnv(ci_lower));
3772          av_push(ci_av, newSVnv(ci_upper));
3773
8
          hv_stores(rhv, "conf.int", newRV_noinc((SV*)ci_av));
3774
8
        }
3775
3776        RETVAL = newRV_noinc((SV*)rhv);
3777
21
}
3778
15
OUTPUT:
3779
18
    RETVAL
3780
3781
4
void shapiro_test(data)
3782
10015
        SV *data
3783
10012
PREINIT:
3784
10012
        AV *restrict av;
3785
10011
        HV *restrict ret_hash;
3786
10011
        size_t n_raw, n = 0;
3787
10011
        double *restrict x, w = 0.0, p_val = 0.0, mean = 0.0, ssq = 0.0;
3788
10011
PPCODE:
3789
10011
        if (!SvROK(data) || SvTYPE(SvRV(data)) != SVt_PVAV) {
3790          croak("Expected an array reference");
3791
1
        }
3792
3793        av = (AV *)SvRV(data);
3794
11
        n_raw = av_len(av) + 1;
3795
3796
10
        Newx(x, n_raw, double);
3797
3798
10
        // Extract variables and calculate mean (skipping undefined/NaN values)
3799
10
        for (size_t i = 0; i < n_raw; i++) {
3800          SV **restrict elem = av_fetch(av, i, 0);
3801
1
          if (elem && SvOK(*elem)) {
3802                   double val = SvNV(*elem);
3803                   if (!isnan(val)) {
3804
6
                       x[n] = val;
3805
5
                       mean += val;
3806                       n++;
3807                   }
3808          }
3809        }
3810
3811        if (n < 3 || n > 5000) {
3812
47
          Safefree(x);
3813
47
          croak("Sample size must be between 3 and 5000 (R's limit)");
3814
47
        }
3815
3816
47
        mean /= n;
3817        // Calculate Sum of Squares */
3818
47
        for (size_t i = 0; i < n; i++) {
3819          ssq += (x[i] - mean) * (x[i] - mean);
3820        }
3821
47
        if (ssq == 0.0) {
3822
21
          Safefree(x);
3823
21
          croak("Data is perfectly constant; cannot compute Shapiro-Wilk test");
3824        }
3825        qsort(x, n, sizeof(double), compare_doubles);
3826
3827
47
        // --- Core AS R94 Algorithm: Weights and Statistic W ---
3828
4
        if (n == 3) {
3829
4
          double a_val = 0.7071067811865475; /* sqrt(1/2) */
3830          double b_val = a_val * (x[2] - x[0]);
3831          w = (b_val * b_val) / ssq;
3832          if (w < 0.75) w = 0.75;
3833
47
          // Exact P-value for n=3
3834
0
          p_val = 1.90985931710274 * (asin(sqrt(w)) - 1.04719755119660);
3835        } else {
3836          double *restrict m, *restrict a;
3837          double sum_m2 = 0.0, b_val = 0.0;
3838
123
          Newx(m, n, double);
3839
76
          Newx(a, n, double);
3840
76
          for (size_t i = 0; i < n; i++) {
3841                   m[i] = inverse_normal_cdf((i + 1.0 - 0.375) / (n + 0.25));
3842
76
                   sum_m2 += m[i] * m[i];
3843
51
          }
3844
46
          double u = 1.0 / sqrt((double)n);
3845
11
          double a_n = -2.706056*pow(u,5) + 4.434685*pow(u,4) - 2.071190*pow(u,3) - 0.147981*pow(u,2) + 0.221157*u + m[n-1]/sqrt(sum_m2);
3846
7
          a[n-1] = a_n;
3847
4
          a[0]   = -a_n;
3848
2
          if (n == 4 || n == 5) {
3849
0
                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1]) / (1.0 - 2.0 * a_n*a_n);
3850                   for (unsigned int i = 1; i < n-1; i++) {
3851                       a[i] = m[i] / sqrt(eps);
3852                   }
3853
47
          } else {
3854
1
                   double a_n1 = -3.582633*pow(u,5) + 5.682633*pow(u,4) - 1.752461*pow(u,3) - 0.293762*pow(u,2) + 0.042981*u + m[n-2]/sqrt(sum_m2);
3855
46
                   a[n-2] = a_n1;
3856
46
                   a[1]   = -a_n1;
3857
46
                   double eps = (sum_m2 - 2.0 * m[n-1]*m[n-1] - 2.0 * m[n-2]*m[n-2]) / (1.0 - 2.0 * a_n*a_n - 2.0 * a_n1*a_n1);
3858
46
                   for (unsigned int i = 2; i < n-2; i++) {
3859
46
                       a[i] = m[i] / sqrt(eps);
3860
8
                   }
3861          }
3862
46
          for (size_t i = 0; i < n; i++) {
3863
1
                   b_val += a[i] * x[i];
3864          }
3865
45
          w = (b_val * b_val) / ssq;
3866
45
        // --- AS R94 P-Value Calculation: High Precision Refinement ---
3867
411
          /* NOTE: p_val is declared in PREINIT above;
3868
366
                * do NOT shadow it with a local 'double p_val' here or the result will never reach the caller.
3869
366
                */
3870
366
          double y = log(1.0 - w);
3871
366
          double z;
3872
366
          if (n <= 11) {
3873                   // Royston's branch for 4 <= n <= 11 (AS R94, small-sample path).
3874
45
                   // gamma is the upper bound on y = log(1-W);
3875
45
                   // if y reaches gamma the p-value is essentially zero
3876                   double nn = (double)n;
3877
51
                   double gamma = 0.459 * nn - 2.273;
3878
9
                   if (y >= gamma) {
3879
8
                       p_val = 1e-19;
3880
8
                   } else {
3881
7
                       // Horner-form polynomials in n for mu and log(sigma)
3882
104
                       double mu     = 0.544  + nn * (-0.39978  + nn * ( 0.025054  - nn * 0.0006714));
3883
97
                       double sig_val= 1.3822 + nn * (-0.77857  + nn * ( 0.062767  - nn * 0.0020322));
3884
97
                       double sigma  = exp(sig_val);
3885
97
                       z = (-log(gamma - y) - mu) / sigma;
3886
97
                       /* Upper-tail probability P(Z > z): small W → large z → small p-value.
3887
97
                       */
3888                       p_val = 0.5 * erfc(z * M_SQRT1_2);
3889
7
                   }
3890
7
          } else {
3891
2
                   // Royston's branch for n >= 12 (AS R94, large-sample path)
3892
14
                   double ln_n   = log((double)n);
3893
12
                   // Horner-form polynomials in log(n) for mu and log(sigma). */
3894
12
                   double mu     = -1.5861 + ln_n * (-0.31082 + ln_n * (-0.083751 + ln_n * 0.0038915));
3895
12
                   double sig_val= -0.4803 + ln_n * (-0.082676 + ln_n * 0.0030302);
3896
12
                   double sigma  = exp(sig_val);
3897
12
                   z = (y - mu) / sigma;
3898
12
                   p_val = 0.5 * erfc(z * M_SQRT1_2);
3899
12
          }
3900
12
          // Clamp the p-value
3901          if (p_val > 1.0) p_val = 1.0;
3902
2
          if (p_val < 0.0) p_val = 0.0;
3903
3904
2
          Safefree(m); m = NULL;  Safefree(a); a = NULL;
3905
2
        }
3906
2
        Safefree(x); x = NULL;
3907
2
        ret_hash = newHV();
3908
2
        hv_stores(ret_hash, "statistic", newSVnv(w));
3909
5
        hv_stores(ret_hash, "W",         newSVnv(w));
3910
2
        hv_stores(ret_hash, "p_value",   newSVnv(p_val));
3911
2
        hv_stores(ret_hash, "p.value",   newSVnv(p_val));
3912
2
        EXTEND(SP, 1);
3913
2
        PUSHs(sv_2mortal(newRV_noinc((SV *)ret_hash)));
3914
3915
2
double min(...)
3916
2
        PROTOTYPE: @
3917
2
        INIT:
3918                double min_val = 0.0;
3919
3
                size_t count = 0;
3920
3
                bool first = TRUE;
3921
3
        CODE:
3922
3
                for (unsigned short int i = 0; i < items; i++) {
3923
3
                        SV* restrict arg = ST(i);
3924
3
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3925
3
                                AV* restrict av = (AV*)SvRV(arg);
3926
3
                                size_t len = av_len(av) + 1;
3927
3
                                for (size_t j = 0; j < len; j++) {
3928
3
                                     SV** restrict tv = av_fetch(av, j, 0);
3929                                     if (tv && SvOK(*tv)) {
3930                                         double val = SvNV(*tv);
3931
35
                                         if (first || val < min_val) {
3932
35
                                             min_val = val;
3933
35
                                             first = FALSE;
3934
35
                                         }
3935
35
                                         count++;
3936                                     } else {
3937
42
                                         croak("min: undefined value at array ref index %zu (argument %d)", j, (int)i);
3938
42
                                     }
3939
42
                                 }
3940
1
                        } else if (SvOK(arg)) {
3941
1
                                 double val = SvNV(arg);
3942
1
                                 if (first || val < min_val) {
3943
41
                                     min_val = val;
3944
1
                                     first = FALSE;
3945
1
                                 }
3946
1
                                 count++;
3947                        } else {
3948
40
                                 croak("min: undefined value at argument index %d", (int)i);
3949
40
                        }
3950
40
                }
3951                if (count == 0) croak("min needs >= 1 numeric element");
3952
42
                RETVAL = min_val;
3953
42
        OUTPUT:
3954
42
          RETVAL
3955
3956
42
double max(...)
3957
42
        PROTOTYPE: @
3958
42
        INIT:
3959
42
                double max_val = 0.0;
3960                size_t count = 0;
3961                bool first = TRUE;
3962        CODE:
3963                for (size_t i = 0; i < items; i++) {
3964                   SV* restrict arg = ST(i);
3965                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
3966
15
                       AV* restrict av = (AV*)SvRV(arg);
3967
1
                       size_t len = av_len(av) + 1;
3968                       for (size_t j = 0; j < len; j++) {
3969
14
                           SV** restrict tv = av_fetch(av, j, 0);
3970
14
                           if (tv && SvOK(*tv)) {
3971                               double val = SvNV(*tv);
3972
14
                               if (first || val > max_val) {
3973
1
                                   max_val = val;
3974                                   first = FALSE;
3975                               }
3976                               count++;
3977
13
                           } else {
3978
157
                               croak("max: undefined value at array ref index %zu (argument %zu)", j, i);
3979                           }
3980
13
                       }
3981
13
                   } else if (SvOK(arg)) {
3982
13
                       double val = SvNV(arg);
3983                       if (first || val > max_val) {
3984                           max_val = val;
3985                           first = FALSE;
3986
13
                       }
3987
13
                       count++;
3988                   } else {
3989
369
                       croak("max: undefined value at argument index %zu", i);
3990
356
                   }
3991
356
          }
3992
356
          if (count == 0) croak("max needs >= 1 numeric element");
3993          RETVAL = max_val;
3994        OUTPUT:
3995
13
                RETVAL
3996
3997
13
SV* runif(...)
3998
53
CODE:
3999
51
{
4000
51
        size_t n = 0;
4001        double min = 0.0, max = 1.0;
4002
4003
2
        // Flags to track what has been assigned
4004
53
        bool n_set = 0, min_set = 0, max_set = 0;
4005
4006
51
        unsigned int i = 0;
4007
4008        if (items == 0) {
4009
9
          croak("Usage: runif(n, [min=0], [max=1]) or runif(n => $n, ...)");
4010
2
        }
4011
4012
51
        while (i < items) {
4013
51
                // 1. Check if the current argument is a string key for a named parameter
4014
51
                if (i + 1 < items && SvPOK(ST(i))) {
4015                        char *restrict key = SvPV_nolen(ST(i));
4016
7
                        if (strEQ(key, "n")) {
4017
2
                                n = (size_t)SvUV(ST(i+1));
4018
53
                                n_set = 1;
4019
51
                                i += 2;
4020
51
                                continue;
4021
51
                        } else if (strEQ(key, "min")) {
4022                                min = SvNV(ST(i+1));
4023
5
                                min_set = 1;
4024
2
                                i += 2;
4025
53
                                continue;
4026
2
                        } else if (strEQ(key, "max")) {
4027
53
                                max = SvNV(ST(i+1));
4028
51
                                max_set = 1;
4029
51
                                i += 2;
4030
51
                                continue;
4031                        }
4032
3
                }
4033
4034
2
                // 2. Fallback to positional parsing if it's not a recognized key
4035
2
                if (!n_set) {
4036                        n = (size_t)SvUV(ST(i));
4037
2
                        n_set = 1;
4038
51
                } else if (!min_set) {
4039
49
                        min = SvNV(ST(i));
4040
49
                        min_set = 1;
4041
0
                } else if (!max_set) {
4042                        max = SvNV(ST(i));
4043                        max_set = 1;
4044                } else {
4045
53
                        croak("Too many arguments or unrecognized parameter passed to runif()");
4046
51
                }
4047
51
                i++;
4048        }
4049
50
        if (!n_set) {
4050
48
                croak("runif() requires at least the 'n' parameter");
4051
48
        }
4052        // Ensure PRNG is seeded
4053
48
        AUTO_SEED_PRNG();
4054
1176
        AV *restrict results = newAV();
4055
1128
        if (n > 0) {
4056
1128
                av_extend(results, n - 1);
4057
266
        }
4058        const double range = max - min;
4059        for (size_t j = 0; j < n; j++) {
4060                double r;
4061
1272
                if (max < min) {
4062
1224
                        r = NAN; // R behavior for inverted ranges
4063
1224
                } else {
4064                        r = min + range * Drand01();
4065                }
4066
1224
                av_push(results, newSVnv(r));
4067
1176
        }
4068        RETVAL = newRV_noinc((SV*)results);
4069}
4070
2448
OUTPUT:
4071
2400
    RETVAL
4072
4073SV* rbinom(...)
4074        CODE:
4075        {
4076          // Auto-seed the PRNG if the Perl script hasn't done so yet
4077
53
          AUTO_SEED_PRNG();
4078
51
          if (items % 2 != 0)
4079
51
                   croak("Usage: rbinom(n => 10, size => 100, prob => 0.5)");
4080
51
          //Parse named arguments
4081          size_t n = 0, size = 0;
4082
2
          double prob = 0.5;
4083
4084
0
          bool size_set = FALSE, prob_set = FALSE;
4085
4086          for (unsigned short i = 0; i < items; i += 2) {
4087                   const char* restrict key = SvPV_nolen(ST(i));
4088
1
                   SV* restrict val = ST(i + 1);
4089
4090                   if      (strEQ(key, "n"))      n    = (unsigned int)SvUV(val);
4091                   else if (strEQ(key, "size")) { size = (unsigned int)SvUV(val); size_set = TRUE; }
4092
12
                   else if (strEQ(key, "prob")) { prob = SvNV(val); prob_set = TRUE; }
4093
318
                   else croak("rbinom: unknown argument '%s'", key);
4094
306
          }
4095
4096
12
          // R requires size and prob to be explicitly passed in rbinom
4097
12
          if (!size_set || !prob_set) croak("rbinom: 'size' and 'prob' are required arguments");
4098          if (prob < 0.0 || prob > 1.0) croak("rbinom: prob must be between 0 and 1");
4099
4100          AV *restrict result_av = newAV();
4101          if (n > 0) {
4102
8
                   av_extend(result_av, n - 1);
4103                   for (unsigned int i = 0; i < n; i++) {
4104
8
                       av_store(result_av, i, newSVuv(generate_binomial(size, prob)));
4105                   }
4106          }
4107
4108
12
          RETVAL = newRV_noinc((SV*)result_av);
4109
16
        }
4110
5
        OUTPUT:
4111
5
                RETVAL
4112
4113
41
SV*
4114
41
hist(SV* x_sv, ...)
4115
40
        CODE:
4116        {
4117
1
                // 1. Validate Input
4118                if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4119                        croak("hist: first argument must be an array reference");
4120
4121
6
                AV*restrict x_av = (AV*)SvRV(x_sv);
4122                size_t n_raw = av_len(x_av) + 1;
4123
1
                if (n_raw == 0) croak("hist: input array is empty");
4124
4125                // 2. Extract Data & Find Range
4126
6
                double *restrict x;
4127                Newx(x, n_raw, double);
4128                size_t n = 0;
4129
5
                double min_val = DBL_MAX, max_val = -DBL_MAX;
4130
4131                for (size_t i = 0; i < n_raw; i++) {
4132
13
                        SV**restrict tv = av_fetch(x_av, i, 0);
4133
8
                        if (tv && SvOK(*tv)) {
4134
12
                                 double val = SvNV(*tv);
4135
4
                                 x[n++] = val;
4136
4
                                 if (val < min_val) min_val = val;
4137
43
                                 if (val > max_val) max_val = val;
4138
39
                        }
4139
39
                }
4140
39
                if (n == 0) {
4141                        Safefree(x);
4142
0
                        croak("hist: input contains no valid numeric data");
4143
0
                }
4144                // 3. Determine Bin Count (Sturges default or user-provided)
4145                size_t n_bins = 0;
4146
4147
4
                if (items == 2) {
4148                        // Support pure positional argument: hist($data, 22)
4149
0
                        n_bins = (size_t)SvIV(ST(1));
4150
0
                } else if (items > 2) {
4151                        /* Support named parameters even if mixed with positional arguments */
4152                        for (unsigned short i = 1; i < items - 1; i++) {
4153                                 /* Make sure the SV holds a string before doing string comparison */
4154                                 if (SvPOK(ST(i)) && strEQ(SvPV_nolen(ST(i)), "breaks")) {
4155
5
                                     n_bins = (size_t)SvIV(ST(i+1));
4156
5
                                     break;
4157
4
                                 }
4158                        }
4159
1
                        /* Fallback: if 'breaks' wasn't found but a positional number was given first */
4160                        if (n_bins == 0 && looks_like_number(ST(1))) {
4161
5
                                 n_bins = (size_t)SvIV(ST(1));
4162
5
                        }
4163
5
                }
4164                if (n_bins == 0) n_bins = calculate_sturges_bins(n);
4165                // 4. Allocate Result Arrays
4166                double *restrict breaks, *restrict mids, *restrict density;
4167                size_t *restrict counts;
4168                Newx(breaks,  n_bins + 1, double);
4169                Newx(mids,    n_bins,     double);
4170
6
                Newx(density, n_bins,     double);
4171
3
                Newx(counts,  n_bins,     size_t);
4172
4173
1
                // Generate simple linear breaks
4174                double step = (max_val - min_val) / (double)n_bins;
4175                for (size_t i = 0; i <= n_bins; i++) {
4176                        breaks[i] = min_val + (double)i * step;
4177
5
                }
4178
4179                // 5. Compute Statistics
4180
5
                compute_hist_logic(x, n, breaks, n_bins, counts, mids, density);
4181
4182
5
                // 6. Build Return HashRef
4183                HV*restrict res_hv = newHV();
4184                AV*restrict av_breaks  = newAV();
4185
5
                AV*restrict av_counts  = newAV();
4186                AV*restrict av_mids    = newAV();
4187
5
                AV*restrict av_density = newAV();
4188
5
                for (size_t i = 0; i <= n_bins; i++) {
4189
1
                        av_push(av_breaks, newSVnv(breaks[i]));
4190                        if (i < n_bins) {
4191                                 av_push(av_counts,  newSViv(counts[i]));
4192                                 av_push(av_mids,    newSVnv(mids[i]));
4193
10
                                 av_push(av_density, newSVnv(density[i]));
4194
5
                        }
4195                }
4196
5
                hv_stores(res_hv, "breaks",  newRV_noinc((SV*)av_breaks));
4197
5
                hv_stores(res_hv, "counts",  newRV_noinc((SV*)av_counts));
4198                hv_stores(res_hv, "mids",    newRV_noinc((SV*)av_mids));
4199
5
                hv_stores(res_hv, "density", newRV_noinc((SV*)av_density));
4200
4201
5
                // Clean
4202
5
                Safefree(x); Safefree(breaks); Safefree(mids);
4203
1
                Safefree(density); Safefree(counts);
4204
4205                RETVAL = newRV_noinc((SV*)res_hv);
4206        }
4207        OUTPUT:
4208
5
          RETVAL
4209
4210SV* quantile(...)
4211
0
        CODE:
4212        {
4213
4
                SV *restrict x_sv = NULL;
4214
1
                SV *restrict probs_sv = NULL;
4215                int arg_idx = 0;
4216
4217
3
                /* --- 1. Consume first positional arg as 'x' if it's an array ref --- */
4218
0
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
4219                         x_sv = ST(arg_idx);
4220                         arg_idx++;
4221
3
                }
4222
4223                /* --- 2. Remaining args must be key-value pairs --- */
4224
30
                if ((items - arg_idx) % 2 != 0)
4225
27
                         croak("Usage: quantile(\\@data, probs => \\@probs)  OR  quantile(x => \\@data, probs => \\@probs)");
4226
4227                for (; arg_idx < items; arg_idx += 2) {
4228
30
                         const char *restrict key = SvPV_nolen(ST(arg_idx));
4229
27
                         SV *restrict val = ST(arg_idx + 1);
4230
4231                         if      (strEQ(key, "x"))     x_sv     = val;
4232                         else if (strEQ(key, "probs")) probs_sv = val;
4233
3
                         else croak("quantile: unknown argument '%s'", key);
4234
3
                }
4235
3
                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4236                        croak("quantile: 'x' must be an array reference");
4237                AV *restrict x_av = (AV*)SvRV(x_sv);
4238                size_t n_raw = av_len(x_av) + 1;
4239
1
                if (n_raw == 0) croak("quantile: 'x' is empty");
4240
4241                /* --- Extract valid numeric data & drop NAs --- */
4242                double *restrict x;
4243
1
                Newx(x, n_raw, double);
4244
1
                size_t n = 0;
4245
0
                for (size_t i = 0; i < n_raw; i++) {
4246                        SV **restrict tv = av_fetch(x_av, i, 0);
4247
1
                        if (tv && SvOK(*tv)) {
4248
1
                                 x[n++] = SvNV(*tv);
4249                        }
4250
1
                }
4251                if (n == 0) {
4252                        Safefree(x);
4253
4
                        croak("quantile: 'x' contains no valid numbers");
4254
3
                }
4255
3
                // --- Sort Data for Quantile Math ---
4256
0
                qsort(x, n, sizeof(double), compare_doubles);
4257                // --- Parse Probabilities (Default matches R's c(0, .25, .5, .75, 1)) ---
4258                double default_probs[] = {0.0, 0.25, 0.50, 0.75, 1.0};
4259
1
                unsigned int n_probs = 5;
4260
1
                double *restrict probs;
4261
4262
3
                if (probs_sv && SvROK(probs_sv) && SvTYPE(SvRV(probs_sv)) == SVt_PVAV) {
4263
3
                        AV *restrict p_av = (AV*)SvRV(probs_sv);
4264
0
                        n_probs = av_len(p_av) + 1;
4265                        Newx(probs, n_probs, double);
4266                        for (unsigned int i = 0; i < n_probs; i++) {
4267                                 SV **tv = av_fetch(p_av, i, 0);
4268                                 probs[i] = (tv && SvOK(*tv)) ? SvNV(*tv) : 0.0;
4269                                 if (probs[i] < 0.0 || probs[i] > 1.0) {
4270
1
                                     Safefree(x); Safefree(probs);
4271                                     croak("quantile: probabilities must be between 0 and 1");
4272
3
                                 }
4273
2
                        }
4274
8
                } else {
4275
6
                        Newx(probs, n_probs, double);
4276
6
                        for (unsigned int i = 0; i < n_probs; i++) probs[i] = default_probs[i];
4277
6
                }
4278
4279                /* --- Calculate Quantiles (R Type 7 Algorithm) --- */
4280                HV *restrict res_hv = newHV();
4281
4282                for (size_t i = 0; i < n_probs; i++) {
4283                        double p = probs[i], q = 0.0;
4284
4285
1
                        if (n == 1) {
4286                                 q = x[0];
4287                        } else if (p == 1.0) {
4288
2
                                 q = x[n - 1]; /* Prevent out-of-bounds mapping */
4289                        } else if (p == 0.0) {
4290
1
                                 q = x[0];
4291
1
                        } else {
4292
1
                                 /* Continuous sample quantile interpolation (Type 7) */
4293                                 double h = (n - 1) * p;
4294
1
                                 unsigned int j = (unsigned int)h; /* floor via cast */
4295
3
                                 double gamma = h - j;
4296
2
                                 q = (1.0 - gamma) * x[j] + gamma * x[j + 1];
4297
8
                        }
4298
4299
6
                        /* Format hash key to exactly match R's naming convention ("25%", "33.3%") */
4300
6
                        char key[32];
4301
6
                        double pct = p * 100.0;
4302
4303                        if (pct == (unsigned int)pct) {
4304                                 snprintf(key, sizeof(key), "%.0f%%", pct);
4305
0
                        } else {
4306
0
                                 snprintf(key, sizeof(key), "%.1f%%", pct);
4307
0
                        }
4308
4309
1
                        hv_store(res_hv, key, strlen(key), newSVnv(q), 0);
4310
0
                }
4311
4312
1
                Safefree(x);
4313
1
                Safefree(probs);
4314
4315                RETVAL = newRV_noinc((SV*)res_hv);
4316
1
        }
4317
3
        OUTPUT:
4318
2
          RETVAL
4319
4320
4321
1
double mean(...)
4322        PROTOTYPE: @
4323        INIT:
4324
0
          double total = 0;
4325
0
          size_t count = 0;
4326
0
        CODE:
4327                for (size_t i = 0; i < items; i++) {
4328
0
                        SV* restrict arg = ST(i);
4329
0
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4330
0
                                AV* restrict av = (AV*)SvRV(arg);
4331
0
                                size_t len = av_len(av) + 1;
4332
0
                                for (size_t j = 0; j < len; j++) {
4333
0
                                     SV** restrict tv = av_fetch(av, j, 0);
4334                                     if (tv && SvOK(*tv)) {
4335                                         total += SvNV(*tv);
4336                                         count++;
4337
0
                                     } else {
4338
0
                                         croak("mean: undefined value at array ref index %zu (argument %zu)", j, i);
4339
0
                                     }
4340                                }
4341
0
                        } else if (SvOK(arg)) {
4342
0
                                 total += SvNV(arg);
4343                                 count++;
4344                        } else {
4345
3
                                 croak("mean: undefined value at argument index %zu", i);
4346
6
                        }
4347
4
                }
4348                if (count == 0) croak("mean needs >= 1 element");
4349                RETVAL = total / count;
4350
3
        OUTPUT:
4351
2
          RETVAL
4352
4353void mode(...)
4354
3
        PROTOTYPE: @
4355
1
        PREINIT:
4356
1
        HV *restrict counts;
4357
3
        HV *restrict originals;
4358
1
        size_t max_count = 0, arg_count = 0;
4359        HE *restrict he;
4360
1
        PPCODE:
4361        /* counts:    string(value) -> occurrence count */
4362        /* originals: string(value) -> SV* first-seen original */
4363        counts    = (HV *)sv_2mortal((SV *)newHV());
4364        originals = (HV *)sv_2mortal((SV *)newHV());
4365
4366        for (size_t i = 0; i < items; i++) {
4367                SV *restrict arg = ST(i);
4368                if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4369
5
                        AV *restrict av = (AV *)SvRV(arg);
4370
5
                        size_t len = av_len(av) + 1;
4371
5
                        for (size_t j = 0; j < len; j++) {
4372                                SV **restrict tv = av_fetch(av, j, 0);
4373
5
                                if (tv && SvOK(*tv)) {
4374
5
                                        STRLEN klen;
4375
5
                                        const char *restrict key = SvPV(*tv, klen);
4376
2
                                        SV **restrict slot = hv_fetch(counts, key, klen, 1);
4377
2
                                        if (!slot) croak("mode: internal hash error");
4378                                        size_t cnt = SvOK(*slot) ? SvIV(*slot) + 1 : 1;
4379
2
                                        sv_setiv(*slot, cnt);
4380
2
                                        if (cnt > max_count) max_count = cnt;
4381
2
                                        if (cnt == 1)
4382
2
                                                 hv_store(originals, key, klen, newSVsv(*tv), 0);
4383
0
                                        arg_count++;
4384                                } else {
4385
2
                                        croak("mode: undefined value at array ref index %zu (argument %zu)", j, i);
4386                                }
4387
2
                        }
4388
1
                } else if (SvOK(arg)) {
4389
1
                        STRLEN klen;
4390
1
                        const char *restrict key = SvPV(arg, klen);
4391
0
                        SV **restrict slot = hv_fetch(counts, key, klen, 1);
4392
0
                        if (!slot) croak("mode: internal hash error");
4393
0
                        size_t cnt = SvOK(*slot) ? SvIV(*slot) + 1 : 1;
4394
0
                        sv_setiv(*slot, cnt);
4395                        if (cnt > max_count) max_count = cnt;
4396
0
                        if (cnt == 1)
4397                          hv_store(originals, key, klen, newSVsv(arg), 0);
4398                        arg_count++;
4399                } else {
4400                        croak("mode: undefined value at argument index %zu", i);
4401
2
                }
4402
2
        }
4403
4404
1
        if (arg_count == 0)
4405
0
                croak("mode needs >= 1 element");
4406
4407
1
        hv_iterinit(counts);
4408
1
        while ((he = hv_iternext(counts))) {
4409
0
                if (SvIV(hv_iterval(counts, he)) == max_count) {
4410
1
                        STRLEN klen;
4411
1
                        const char *restrict key = HePV(he, klen);
4412
0
                        SV **restrict orig = hv_fetch(originals, key, klen, 0);
4413
0
                        mXPUSHs(orig ? newSVsv(*orig) : newSVpvn(key, klen));
4414
0
                }
4415
0
        }
4416
4417double sum(...)
4418
0
        PROTOTYPE: @
4419        INIT:
4420                double total = 0;
4421                size_t count = 0;
4422        CODE:
4423                for (size_t i = 0; i < items; i++) {
4424                        SV* restrict arg = ST(i);
4425
5
                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4426
5
                                 AV* restrict av = (AV*)SvRV(arg);
4427
2
                                 size_t len = av_len(av) + 1;
4428
2
                                 for (size_t j = 0; j < len; j++) {
4429
1
                                     SV** restrict tv = av_fetch(av, j, 0);
4430
1
                                     if (tv && SvOK(*tv)) {
4431
1
                                         total += SvNV(*tv);
4432
1
                                         count++;
4433
1
                                     } else {
4434                                         croak("sum: undefined value at array ref index %zu (argument %zu)", j, i);
4435                                     }
4436                                 }
4437                        } else if (SvOK(arg)) {
4438
5
                                 total += SvNV(arg);
4439                                 count++;
4440                        } else {
4441                                 croak("sum: undefined value at argument index %zu", i);
4442
1
                        }
4443
1
                }
4444                if (count == 0) croak("sum needs >= 1 element");
4445
1
                RETVAL = total;
4446
1
        OUTPUT:
4447          RETVAL
4448
4449double sd(...)
4450        PROTOTYPE: @
4451
1
        INIT:
4452
1
          double mean = 0.0, M2 = 0.0;
4453
1
          size_t count = 0;
4454
4
        CODE:
4455
3
                /* Single Pass Standard Deviation via Welford's Algorithm */
4456
3
                for (size_t i = 0; i < items; i++) {
4457
3
                        SV* restrict arg = ST(i);
4458                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4459                                AV* restrict av = (AV*)SvRV(arg);
4460
3
                                size_t len = av_len(av) + 1;
4461
2
                                for (size_t j = 0; j < len; j++) {
4462                                  SV** restrict tv = av_fetch(av, j, 0);
4463
2
                                  if (tv && SvOK(*tv)) {
4464                                                count++;
4465
8
                                                double val = SvNV(*tv);
4466
6
                                                double delta = val - mean;
4467
6
                                                mean += delta / count;
4468
6
                                                M2 += delta * (val - mean);
4469
6
                                  } else {
4470
6
                                                croak("sd: undefined value at array ref index %zu (argument %zu)", j, i);
4471                                  }
4472
0
                                }
4473                        } else if (SvOK(arg)) {
4474
6
                                 count++;
4475                                 double val = SvNV(arg);
4476                                 double delta = val - mean;
4477
2
                                 mean += delta / count;
4478
2
                                 M2 += delta * (val - mean);
4479                        } else {
4480
2
                                 croak("sd: undefined value at argument index %zu", i);
4481
2
                        }
4482
0
                }
4483
0
                if (count < 2) croak("sd needs >= 2 elements");
4484
0
                RETVAL = sqrt(M2 / (count - 1));
4485        OUTPUT:
4486
2
          RETVAL
4487
4488
4489
6
double var(...)
4490        PROTOTYPE: @
4491
2
        INIT:
4492          double mean = 0.0, M2 = 0.0;
4493          size_t count = 0;
4494
8
        CODE:
4495
6
        /* Single Pass Variance via Welford's Algorithm */
4496
6
                for (size_t i = 0; i < items; i++) {
4497
6
                        SV* restrict arg = ST(i);
4498                        if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4499
2
                                 AV* restrict av = (AV*)SvRV(arg);
4500                                 size_t len = av_len(av) + 1;
4501
1
                                 for (size_t j = 0; j < len; j++) {
4502                                          SV** restrict tv = av_fetch(av, j, 0);
4503
1
                                          if (tv && SvOK(*tv)) {
4504
1
                                                   count++;
4505                                                   double val = SvNV(*tv);
4506                                                   double delta = val - mean;
4507                                                   mean += delta / count;
4508                                                   M2 += delta * (val - mean);
4509
4
                                          } else {
4510                                                   croak("var: undefined value at array ref index %zu (argument %zu)", j, i);
4511
4
                                          }
4512
20
                                 }
4513
16
                        } else if (SvOK(arg)) {
4514
16
                                 count++;
4515
0
                                 double val = SvNV(arg);
4516
0
                                 double delta = val - mean;
4517
0
                                 mean += delta / count;
4518
0
                                 M2 += delta * (val - mean);
4519
0
                        } else {
4520                                 croak("var: undefined value at argument index %zu", i);
4521
16
                        }
4522
16
                }
4523                if (count < 2) croak("var needs >= 2 elements");
4524                RETVAL = M2 / (count - 1);
4525
4
        OUTPUT:
4526
4
                RETVAL
4527
4528
16
SV* t_test(...)
4529
16
        CODE:
4530
0
        {
4531
0
                SV*restrict x_sv = NULL;
4532
0
                SV*restrict y_sv = NULL;
4533
0
                double mu = 0.0, conf_level = 0.95;
4534
0
                bool paired = FALSE, var_equal = FALSE;
4535
0
                const char*restrict alternative = "two.sided";
4536
4537                unsigned short int arg_idx = 0;
4538
4539
16
                // 1. Shift first positional argument as 'x' if it's an array reference
4540
16
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
4541
16
                  x_sv = ST(arg_idx);
4542                  arg_idx++;
4543                }
4544
4545
4
                // 2. Shift second positional argument as 'y' if it's an array reference
4546
3
                if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
4547
1
                  y_sv = ST(arg_idx);
4548
1
                  arg_idx++;
4549                }
4550
4551
12
                // Ensure the remaining arguments form complete key-value pairs
4552
10
                if ((items - arg_idx) % 2 != 0) {
4553
10
                  croak("Usage: t_test(\\@x, [\\@y], key => value, ...)");
4554                }
4555
4556                // --- Parse named arguments from the remaining flat stack ---
4557
3
                for (; arg_idx < items; arg_idx += 2) {
4558
18
                        const char*restrict key = SvPV_nolen(ST(arg_idx));
4559
15
                        SV*restrict val = ST(arg_idx + 1);
4560
4561
15
                        if      (strEQ(key, "x"))           x_sv        = val;
4562                        else if (strEQ(key, "y"))           y_sv        = val;
4563
3
                        else if (strEQ(key, "mu"))          mu          = SvNV(val);
4564                        else if (strEQ(key, "paired"))      paired      = SvTRUE(val);
4565                        else if (strEQ(key, "var_equal"))   var_equal   = SvTRUE(val);
4566                        else if (strEQ(key, "conf_level"))  conf_level  = SvNV(val);
4567                        else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
4568                        else croak("t_test: unknown argument '%s'", key);
4569                }
4570
4571
0
                // --- Validate required / types ---
4572                if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4573
4
                        croak("t_test: 'x' is a required argument and must be an ARRAY reference");
4574
4
                AV*restrict x_av = (AV*)SvRV(x_sv);
4575
4
                size_t nx = av_len(x_av) + 1;
4576                if (nx < 2) croak("t_test: 'x' needs at least 2 elements");
4577
12
                AV*restrict y_av = NULL;
4578
8
                if (y_sv && SvROK(y_sv) && SvTYPE(SvRV(y_sv)) == SVt_PVAV)
4579
8
                        y_av = (AV*)SvRV(y_sv);
4580
4581
4
                if (conf_level <= 0.0 || conf_level >= 1.0)
4582
4
                        croak("t_test: 'conf_level' must be between 0 and 1");
4583
3
                // --- Computation via Welford's Algorithm --- */
4584
3
                double mean_x = 0.0, M2_x = 0.0, var_x, t_stat, df, p_val, std_err, cint_est;
4585
1
                HV*restrict results = newHV();
4586
1
                for (size_t i = 0; i < nx; i++) {
4587
1
                        SV**restrict tv = av_fetch(x_av, i, 0);
4588
0
                        double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
4589
0
                        double delta = val - mean_x;
4590                        mean_x += delta / (i + 1);
4591
0
                        M2_x += delta * (val - mean_x);
4592                }
4593                var_x = M2_x / (nx - 1);
4594                if (var_x == 0.0 && !y_av) croak("t_test: data are essentially constant");
4595
4596
1
                if (paired || y_av) {
4597                        if (!y_av) croak("t_test: 'y' must be provided for paired or two-sample tests");
4598
3
                        size_t ny = av_len(y_av) + 1;
4599
3
                        if (paired && ny != nx) croak("t_test: Paired arrays must be same length");
4600
3
                        double mean_y = 0.0, M2_y = 0.0, var_y;
4601
1
                        for (size_t i = 0; i < ny; i++) {
4602                                 SV**restrict tv = av_fetch(y_av, i, 0);
4603                                 double val = (tv && SvOK(*tv)) ? SvNV(*tv) : 0;
4604
2
                                 double delta = val - mean_y;
4605
0
                                 mean_y += delta / (i + 1);
4606
0
                                 M2_y += delta * (val - mean_y);
4607
2
                        }
4608
1
                        var_y = M2_y / (ny - 1);
4609
1
                        if (paired) {
4610
0
                                 double mean_d = 0.0, M2_d = 0.0;
4611                                 for (size_t i = 0; i < nx; i++) {
4612                                          SV**restrict dx_ptr = av_fetch(x_av, i, 0);
4613
2
                                          SV**restrict dy_ptr = av_fetch(y_av, i, 0);
4614
1
                                     double dx = (dx_ptr && SvOK(*dx_ptr)) ? SvNV(*dx_ptr) : 0.0;
4615                                     double dy = (dy_ptr && SvOK(*dy_ptr)) ? SvNV(*dy_ptr) : 0.0;
4616                                     double val = dx - dy;
4617
1
                                     double delta = val - mean_d;
4618
1
                                     mean_d += delta / (i + 1);
4619                                     M2_d += delta * (val - mean_d);
4620
1
                                 }
4621
3
                                 double var_d = M2_d / (nx - 1);
4622
2
                                 if (var_d == 0.0) croak("t_test: data are essentially constant");
4623
2
                                 cint_est = mean_d;
4624
2
                                 std_err  = sqrt(var_d / nx);
4625                                 t_stat   = (cint_est - mu) / std_err;
4626                                 df       = nx - 1;
4627
1
                                 hv_store(results, "estimate", 8, newSVnv(mean_d), 0);
4628
7
                        } else if (var_equal) {
4629                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
4630
6
                                 double pooled_var = ((nx - 1) * var_x + (ny - 1) * var_y) / (nx + ny - 2);
4631
6
                                 cint_est = mean_x - mean_y;
4632
6
                                 std_err  = sqrt(pooled_var * (1.0 / nx + 1.0 / ny));
4633
0
                                 t_stat   = (cint_est - mu) / std_err;
4634
0
                                 df       = nx + ny - 2;
4635                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
4636
6
                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
4637
6
                        } else {
4638                                 if (var_x == 0.0 && var_y == 0.0) croak("t_test: data are essentially constant");
4639
6
                                 cint_est         = mean_x - mean_y;
4640                                 double stderr_x2 = var_x / nx;
4641
1
                                 double stderr_y2 = var_y / ny;
4642
1
                                 std_err          = sqrt(stderr_x2 + stderr_y2);
4643                                 t_stat           = (cint_est - mu) / std_err;
4644                                 df = pow(stderr_x2 + stderr_y2, 2) /
4645                                      (pow(stderr_x2, 2) / (nx - 1) + pow(stderr_y2, 2) / (ny - 1));
4646                                 hv_store(results, "estimate_x", 10, newSVnv(mean_x), 0);
4647                                 hv_store(results, "estimate_y", 10, newSVnv(mean_y), 0);
4648                        }
4649
22
                } else {
4650
22
                        cint_est = mean_x;
4651                        std_err  = sqrt(var_x / nx);
4652                        t_stat   = (cint_est - mu) / std_err;
4653                        df       = nx - 1;
4654
22
                        hv_store(results, "estimate", 8, newSVnv(mean_x), 0);
4655
22
                }
4656
22
                p_val = get_t_pvalue(t_stat, df, alternative);
4657
22
                double alpha = 1.0 - conf_level, t_crit, ci_lower, ci_upper;
4658
22
                if (strcmp(alternative, "less") == 0) {
4659
22
                        t_crit   = qt_tail(df, alpha);
4660                        ci_lower = -INFINITY;
4661
22
                        ci_upper = cint_est + t_crit * std_err;
4662
22
                } else if (strcmp(alternative, "greater") == 0) {
4663
22
                        t_crit   = qt_tail(df, alpha);
4664
22
                        ci_lower = cint_est - t_crit * std_err;
4665                        ci_upper = INFINITY;
4666
22
                } else {
4667
22
                        t_crit   = qt_tail(df, alpha / 2.0);
4668
22
                        ci_lower = cint_est - t_crit * std_err;
4669
22
                        ci_upper = cint_est + t_crit * std_err;
4670                }
4671                AV*restrict conf_int = newAV();
4672
22
                av_push(conf_int, newSVnv(ci_lower));
4673                av_push(conf_int, newSVnv(ci_upper));
4674                hv_store(results, "statistic", 9, newSVnv(t_stat), 0);
4675
22
                hv_store(results, "df",        2, newSVnv(df),     0);
4676                hv_store(results, "p_value",   7, newSVnv(p_val),  0);
4677
64
                hv_store(results, "conf_int",  8, newRV_noinc((SV*)conf_int), 0);
4678
42
                RETVAL = newRV_noinc((SV*)results);
4679
42
        }
4680
42
        OUTPUT:
4681
21
                RETVAL
4682
4683void p_adjust(SV* p_sv, const char* method = "holm")
4684
22
        INIT:
4685
21
                if (!SvROK(p_sv) || SvTYPE(SvRV(p_sv)) != SVt_PVAV) {
4686                        croak("p_adjust: first argument must be an ARRAY reference of p-values");
4687                }
4688                AV *restrict p_av = (AV*)SvRV(p_sv);
4689                size_t n = av_len(p_av) + 1;
4690
19
                // Handle empty input
4691
19
                if (n == 0) {
4692
19
                        XSRETURN_EMPTY;
4693
19
                }
4694
19
                // Normalize method string
4695
19
                char meth[64];
4696
19
                strncpy(meth, method, 63); meth[63] = '\0';
4697
19
                for(unsigned short int i = 0; meth[i]; i++) meth[i] = tolower(meth[i]);
4698
12
                // Resolve aliases
4699
12
                if (strstr(meth, "benjamini") && strstr(meth, "hochberg")) strcpy(meth, "bh");
4700
12
                if (strstr(meth, "benjamini") && strstr(meth, "yekutieli")) strcpy(meth, "by");
4701
82
                if (strcmp(meth, "fdr") == 0) strcpy(meth, "bh");
4702                // Allocate C memory
4703
70
                PVal *restrict arr;
4704
70
                double *restrict adj;
4705                Newx(arr, n, PVal);
4706
7
                Newx(adj, n, double);
4707
4708
7
                for (size_t i = 0; i < n; i++) {
4709
7
                        SV**restrict tv = av_fetch(p_av, i, 0);
4710
231
                        arr[i].p = (tv && SvOK(*tv)) ? SvNV(*tv) : 1.0;
4711                        arr[i].orig_idx = i;
4712
224
                }
4713
224
                // Sort ascending (Stable sort using original index)
4714
224
                qsort(arr, n, sizeof(PVal), cmp_pval);
4715        PPCODE:
4716
0
                if (strcmp(meth, "bonferroni") == 0) {
4717                        for (size_t i = 0; i < n; i++) {
4718
0
                                double v = arr[i].p * n;
4719
0
                                adj[arr[i].orig_idx] = (v < 1.0) ? v : 1.0;
4720
0
                        }
4721
0
                } else if (strcmp(meth, "holm") == 0) {
4722
0
                        double cummax = 0.0;
4723
0
                        for (size_t i = 0; i < n; i++) {
4724
0
                                 double v = arr[i].p * (n - i);
4725
0
                                 if (v > cummax) cummax = v;
4726
0
                                 adj[arr[i].orig_idx] = (cummax < 1.0) ? cummax : 1.0;
4727
0
                        }
4728                } else if (strcmp(meth, "hochberg") == 0) {
4729
0
                        double cummin = 1.0;
4730
0
                        for (ssize_t i = n - 1; i >= 0; i--) {
4731
0
                                 double v = arr[i].p * (n - i);
4732                                 if (v < cummin) cummin = v;
4733                                 adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4734
0
                        }
4735                } else if (strcmp(meth, "bh") == 0) {
4736                        double cummin = 1.0;
4737                        for (ssize_t i = n - 1; i >= 0; i--) {
4738                                double v = arr[i].p * n / (i + 1.0);
4739
19
                                if (v < cummin) cummin = v;
4740
215
                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4741
19
                        }
4742                } else if (strcmp(meth, "by") == 0) {
4743
19
                        double q = 0.0;
4744
19
                        for (size_t i = 1; i <= n; i++) q += 1.0 / i;
4745
3
                        double cummin = 1.0;
4746
1
                        for (ssize_t i = n - 1; i >= 0; i--) {
4747
1
                                double v = arr[i].p * n / (i + 1.0) * q;
4748                                if (v < cummin) cummin = v;
4749
18
                                adj[arr[i].orig_idx] = (cummin < 1.0) ? cummin : 1.0;
4750
18
                        }
4751
18
                } else if (strcmp(meth, "hommel") == 0) {
4752                        double *restrict pa, *restrict q_arr;
4753                        Newx(pa, n, double);
4754                        Newx(q_arr, n, double);
4755                        // Initial: min(n * p[i] / (i + 1))
4756                        double min_val = n * arr[0].p;
4757
18
                        for (size_t i = 1; i < n; i++) {
4758
89
                                double temp = (n * arr[i].p) / (i + 1.0);
4759                                if (temp < min_val) {
4760
71
                                   min_val = temp;
4761
0
                                }
4762
0
                        }
4763
0
                        // pa <- q <- rep(min, n)
4764                        for (size_t i = 0; i < n; i++) {
4765                                 pa[i] = min_val;
4766
71
                                 q_arr[i] = min_val;
4767
1
                        }
4768
1
                        for (size_t j = n - 1; j >= 2; j--) {
4769
1
                                 ssize_t n_mj = n - j;       // Max index for 'ij'. Length is n_mj + 1
4770
1
                                 ssize_t i2_len = j - 1;     // Length of 'i2
4771                                 // Calculate q1 = min(j * p[i2] / (2:j))
4772                                 double q1 = (j * arr[n_mj + 1].p) / 2.0;
4773
70
                                 for (size_t k = 1; k < i2_len; k++) {
4774
0
                                     double temp_q1 = (j * arr[n_mj + 1 + k].p) / (2.0 + k);
4775
0
                                     if (temp_q1 < q1) {
4776
0
                                         q1 = temp_q1;
4777
0
                                     }
4778                                 }
4779                                 // q[ij] <- pmin(j * p[ij], q1)
4780
70
                                 for (size_t i = 0; i <= n_mj; i++) {
4781
0
                                     double v = j * arr[i].p;
4782
0
                                     q_arr[i] = (v < q1) ? v : q1;
4783
0
                                 }
4784                                 // q[i2] <- q[n - j]
4785                                 for (size_t i = 0; i < i2_len; i++) {
4786
70
                                     q_arr[n_mj + 1 + i] = q_arr[n_mj];
4787
0
                                }
4788                                 // pa <- pmax(pa, q)
4789                                for (size_t i = 0; i < n; i++) {
4790
70
                                    if (pa[i] < q_arr[i]) {
4791
0
                                       pa[i] = q_arr[i];
4792
0
                                    }
4793
0
                                }
4794                        }
4795                        // pmin(1, pmax(pa, p))[ro] — map sorted results back to original indices
4796
70
                        for (size_t i = 0; i < n; i++) {
4797
18
                                double v = (pa[i] > arr[i].p) ? pa[i] : arr[i].p;
4798
18
                                if (v > 1.0) v = 1.0;
4799                                adj[arr[i].orig_idx] = v;
4800
70
                        }
4801                        Safefree(pa);  Safefree(q_arr);
4802                } else if (strcmp(meth, "none") == 0) {
4803                        for (size_t i = 0; i < n; i++) {
4804                                adj[arr[i].orig_idx] = arr[i].p;
4805                        }
4806                } else {
4807
18
                        Safefree(arr); Safefree(adj);
4808
0
                        croak("Unknown p-value adjustment method: %s", method);
4809
18
                }
4810
18
                // Push values onto the Perl stack as a flat list
4811
18
                EXTEND(SP, n);
4812                for (size_t i = 0; i < n; i++) {
4813                        PUSHs(sv_2mortal(newSVnv(adj[i])));
4814                }
4815
18
                Safefree(arr); arr = NULL;
4816
18
                Safefree(adj); adj = NULL;
4817
4818
44
double median(...)
4819
26
        PROTOTYPE: @
4820
1
        INIT:
4821
4
          size_t total_count = 0, k = 0;
4822
3
          double* restrict nums;
4823
3
          double median_val = 0.0;
4824
3
        CODE:
4825
3
          /* Pass 1: Count valid elements — die immediately on any undef */
4826
2
          for (size_t i = 0; i < items; i++) {
4827
2
                   SV* restrict arg = ST(i);
4828
2
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4829
2
                       AV* restrict av = (AV*)SvRV(arg);
4830
2
                       size_t len = av_len(av) + 1;
4831                       for (size_t j = 0; j < len; j++) {
4832                           SV** restrict tv = av_fetch(av, j, 0);
4833                           if (tv && SvOK(*tv)) {
4834                               total_count++;
4835
1
                           } else {
4836                               croak("median: undefined value at array ref index %zu (argument %zu)", j, i);
4837
25
                           }
4838
25
                       }
4839
25
                   } else if (SvOK(arg)) {
4840
25
                       total_count++;
4841
25
                   } else {
4842                       croak("median: undefined value at argument index %zu", i);
4843                   }
4844
26
          }
4845          if (total_count == 0) croak("median needs >= 1 element");
4846
4847
18
          /* Allocate C array now that we know the exact size */
4848
18
          Newx(nums, total_count, double);
4849
4850          /* Pass 2: Populate the C array — Safefree before any croak */
4851
18
          for (size_t i = 0; i < items; i++) {
4852                   SV* restrict arg = ST(i);
4853
18
                   if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
4854
18
                       AV* restrict av = (AV*)SvRV(arg);
4855
45
                       size_t len = av_len(av) + 1;
4856
27
                       for (size_t j = 0; j < len; j++) {
4857
0
                           SV** restrict tv = av_fetch(av, j, 0);
4858
0
                           if (tv && SvOK(*tv)) {
4859                               nums[k++] = SvNV(*tv);
4860
27
                           } else {
4861
27
                               Safefree(nums);
4862
1
                               croak("median: undefined value at array ref index %zu (argument %zu)", j, i);
4863
1
                           }
4864
1
                       }
4865
1
                   } else if (SvOK(arg)) {
4866
1
                       nums[k++] = SvNV(arg);
4867
1
                   } else {
4868
1
                       Safefree(nums);
4869
1
                       croak("median: undefined value at argument index %zu", i);
4870
1
                   }
4871
1
          }
4872
4873
1
          /* Sort and calculate median */
4874          qsort(nums, total_count, sizeof(double), compare_doubles);
4875
26
          if (total_count % 2 == 0) {
4876
26
                   median_val = (nums[total_count / 2 - 1] + nums[total_count / 2]) / 2.0;
4877
26
          } else {
4878                   median_val = nums[total_count / 2];
4879
27
          }
4880          Safefree(nums);
4881          nums = NULL;
4882          RETVAL = median_val;
4883
64
        OUTPUT:
4884
46
          RETVAL
4885
4886
46
SV* cor(SV* x_sv, SV* y_sv = &PL_sv_undef, const char* method = "pearson")
4887        INIT:
4888
18
        // --- validate method -------------------------------------------
4889        if (strcmp(method, "pearson")  != 0 &&
4890                strcmp(method, "spearman") != 0 &&
4891                strcmp(method, "kendall")  != 0)
4892                  croak("cor: unknown method '%s' (use 'pearson', 'spearman', or 'kendall')",
4893
64
                        method);
4894
4895
0
        // --- validate x ------------------------------------------------
4896
0
        if (!SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
4897
0
                  croak("cor: x must be an ARRAY reference");
4898
4899
46
        AV*restrict x_av = (AV*)SvRV(x_sv);
4900
17
        size_t nx   = av_len(x_av) + 1;
4901        if (nx == 0) croak("cor: x is empty");
4902
4903
5
        // --- detect whether x is a flat vector or a matrix (AoA) -------
4904
5
        bool x_is_matrix = 0;
4905
5
        {
4906
47
                  SV**restrict fp = av_fetch(x_av, 0, 0);
4907
42
                  if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4908
42
                      x_is_matrix = 1;
4909
42
        }
4910
4911
42
        // --- detect y ----------------------------
4912
14
        bool has_y = (SvOK(y_sv) && SvROK(y_sv) &&
4913
14
                           SvTYPE(SvRV(y_sv)) == SVt_PVAV);
4914
4915
42
        AV*restrict y_av = has_y ? (AV*)SvRV(y_sv) : NULL;
4916        size_t ny = has_y ? av_len(y_av) + 1 : 0;
4917
4918
5
        bool y_is_matrix = 0;
4919
14
        if (has_y && ny > 0) {
4920
22
                SV**restrict fp = av_fetch(y_av, 0, 0);
4921
13
                if (fp && SvROK(*fp) && SvTYPE(SvRV(*fp)) == SVt_PVAV)
4922
14
                        y_is_matrix = 1;
4923
9
        }
4924
4925
0
        CODE:
4926
0
        // Branch 1: both inputs are flat vectors  â†’  scalar result
4927        if (!x_is_matrix && !y_is_matrix) {
4928
9
                  if (!has_y) {
4929
9
                      /* cor(vector) == 1 by definition */
4930
9
                      RETVAL = newSVnv(1.0);
4931
9
                  } else {
4932
9
                      if (nx != ny)
4933
9
                          croak("cor: x and y must have the same length (%lu vs %lu)",
4934
9
                                nx, ny);
4935
4936
19
                      if (nx < 2)
4937
5
                          croak("cor: need at least 2 observations");
4938
4939
0
                      double *restrict xd, *restrict yd;
4940
0
                      Newx(xd, nx, double);
4941                      Newx(yd, ny, double);
4942
4943
24
                      bool x_sd0 = 1, y_sd0 = 1;
4944                      double x_first = NAN, y_first = NAN;
4945
4946
18
                      for (size_t i = 0; i < nx; i++) {
4947
18
                          SV**restrict tv = av_fetch(x_av, i, 0);
4948
18
                          double val = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4949                          xd[i] = val;
4950                          if (!isnan(val)) {
4951                              if (isnan(x_first)) x_first = val;
4952                              else if (val != x_first) x_sd0 = 0;
4953
310
                          }
4954
292
                      }
4955
292
                      for (size_t i = 0; i < ny; i++) {
4956                          SV**restrict tv = av_fetch(y_av, i, 0);
4957
289
                          double val = (tv && SvOK(*tv) && looks_like_number(*tv)) ? SvNV(*tv) : NAN;
4958
289
                          yd[i] = val;
4959
1112
                          if (!isnan(val)) {
4960
823
                              if (isnan(y_first)) y_first = val;
4961
257
                              else if (val != y_first) y_sd0 = 0;
4962
566
                          }
4963
78
                      }
4964
4965
78
                      if (x_sd0 || y_sd0) {
4966
78
                          Safefree(xd); Safefree(yd);
4967
0
                          if (x_sd0) croak("cor: standard deviation of x is 0");
4968                          croak("cor: standard deviation of y is 0");
4969
488
                      }
4970
4971                      double r = compute_cor(xd, yd, nx, method);
4972                      Safefree(xd); Safefree(yd);
4973
289
                      RETVAL = newSVnv(r);
4974                  }
4975
289
        } else {//Branch 2: x is a matrix (or y is a matrix)  â†’  AoA result
4976
1112
                  // -- resolve x matrix dimensions
4977
289
                  if (!x_is_matrix)
4978
289
                      croak("cor: x must be a matrix (array ref of array refs) "
4979
289
                            "when y is a matrix");
4980
4981
18
                  SV**restrict xr0 = av_fetch(x_av, 0, 0);
4982                  if (!xr0 || !SvROK(*xr0) || SvTYPE(SvRV(*xr0)) != SVt_PVAV)
4983
18
                      croak("cor: each row of x must be an ARRAY reference");
4984
4985
7
                  size_t ncols_x = av_len((AV*)SvRV(*xr0)) + 1;
4986
7
                  if (ncols_x == 0) croak("cor: x matrix has zero columns");
4987
4988
5
                  size_t nrows   = nx;    /* observations */
4989
4990
2
                  // PRE-VALIDATION PASS: Ensure all rows are arrays to prevent memory leaks on croak
4991
2
                  for (size_t i = 0; i < nrows; i++) {
4992
2
                      SV**restrict rv = av_fetch(x_av, i, 0);
4993
2
                      if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
4994                          croak("cor: x row %lu is not an array ref", i);
4995                  }
4996
4997                  if (has_y && y_is_matrix) {
4998                      if (ny != nrows) croak("cor: x and y must have the same number of rows (%lu vs %lu)", nrows, ny);
4999
16
                      for (size_t i = 0; i < nrows; i++) {
5000
61
                          SV**restrict rv = av_fetch(y_av, i, 0);
5001
178
                          if (!rv || !SvROK(*rv) || SvTYPE(SvRV(*rv)) != SVt_PVAV)
5002
133
                              croak("cor: y row %lu is not an array ref", i);
5003
2620
                      }
5004
133
                  }
5005
5006
16
                  // -- extract x columns
5007
61
                  double **restrict col_x;
5008
45
                  Newx(col_x, ncols_x, double*);
5009
5010
45
                  for (size_t j = 0; j < ncols_x; j++) {
5011                      Newx(col_x[j], nrows, double);
5012
16
                      bool sd0 = 1;
5013
16
                      double first = NAN;
5014
16
                      for (size_t i = 0; i < nrows; i++) {
5015
61
                          SV**restrict rv = av_fetch(x_av, i, 0);
5016
45
                          AV*restrict  row = (AV*)SvRV(*rv);
5017                          SV**restrict cv  = av_fetch(row, j, 0);
5018
44
                          double val = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
5019
174
                          col_x[j][i] = val;
5020
44
                          if (!isnan(val)) {
5021                              if (isnan(first)) first = val;
5022                              else if (val != first) sd0 = 0;
5023                          }
5024                      }
5025                      if (sd0) {
5026                          for (size_t k = 0; k <= j; k++) Safefree(col_x[k]);
5027
16
                          Safefree(col_x);
5028
16
                          croak("cor: standard deviation is 0 in x column %lu", j);
5029                      }
5030
16
                  }
5031
5032                  // -- resolve y: separate matrix or re-use x (symmetric)
5033
16
                  size_t ncols_y;
5034
302
                  double **restrict col_y   = NULL;
5035
16
                  bool symmetric = 0;
5036
5037
302
                  // 1 = cor(X) — result is symmetric
5038
286
                  if (has_y && y_is_matrix) {
5039
1101
                      // cross-correlation: X (nrows × p) vs Y (nrows × q)
5040
286
                      SV**restrict yr0 = av_fetch(y_av, 0, 0);
5041
286
                      ncols_y = av_len((AV*)SvRV(*yr0)) + 1;
5042
286
                      if (ncols_y == 0) croak("cor: y matrix has zero columns");
5043
5044
286
                      Newx(col_y, ncols_y, double*);
5045
286
                      for (size_t j = 0; j < ncols_y; j++) {
5046
286
                          Newx(col_y[j], nrows, double);
5047                          bool sd0 = 1;
5048
16
                          double first = NAN;
5049                          for (size_t i = 0; i < nrows; i++) {
5050                              SV**restrict  rv = av_fetch(y_av, i, 0);
5051
16
                              AV*restrict  row = (AV*)SvRV(*rv);
5052                              SV**restrict cv  = av_fetch(row, j, 0);
5053
16
                              double val = (cv && SvOK(*cv) && looks_like_number(*cv)) ? SvNV(*cv) : NAN;
5054
16
                              col_y[j][i] = val;
5055
16
                              if (!isnan(val)) {
5056                                  if (isnan(first)) first = val;
5057
16
                                  else if (val != first) sd0 = 0;
5058
16
                              }
5059
16
                          }
5060
16
                          if (sd0) {
5061
16
                              for (size_t k = 0; k < ncols_x; k++) Safefree(col_x[k]);
5062
16
                              Safefree(col_x);
5063
0
                              for (size_t k = 0; k <= j; k++) Safefree(col_y[k]);
5064
0
                              Safefree(col_y);
5065
0
                              croak("cor: standard deviation is 0 in y column %lu", j);
5066                          }
5067
0
                      }
5068
0
                  } else { // cor(X) — symmetric p×p result; share column arrays
5069                      ncols_y  = ncols_x;
5070                      col_y    = col_x;
5071
61
                      symmetric = 1;
5072
45
                  }
5073
45
                  if (nrows < 2)
5074
45
                      croak("cor: need at least 2 observations (got %lu)", nrows);
5075
45
                  // -- build cache for symmetric case: compute upper triangle, store results, mirror to lower triangle
5076
1
                  AV*restrict result_av = newAV();
5077
1
                  av_extend(result_av, ncols_x - 1);
5078
1
                  // Allocate per-row AVs up front so we can fill them in order
5079
1
                  AV **restrict rows_out;
5080                  Newx(rows_out, ncols_x, AV*);
5081
44
                  for (size_t i = 0; i < ncols_x; i++) {
5082
44
                      rows_out[i] = newAV();
5083
44
                      av_extend(rows_out[i], ncols_y - 1);
5084
44
                  }
5085
44
                  if (symmetric) {
5086
44
/* Upper triangle + diagonal, then mirror. r_cache[i][j] (j >= i) holds the computed value. */
5087
44
                      double **restrict r_cache;
5088                      Newx(r_cache, ncols_x, double*);
5089
45
                      for (size_t i = 0; i < ncols_x; i++)
5090                          Newx(r_cache[i], ncols_x, double);
5091
5092
16
                      for (size_t i = 0; i < ncols_x; i++) {
5093
16
                          r_cache[i][i] = 1.0; // diagonal
5094
16
                          for (size_t j = i + 1; j < ncols_x; j++) {
5095
16
                              double r = compute_cor(col_x[i], col_x[j], nrows, method);
5096
16
                              r_cache[i][j] = r;
5097
16
                              r_cache[j][i] = r; // symmetry
5098
16
                          }
5099
16
                      }
5100
16
                      // fill output AoA from cache
5101
16
                      for (size_t i = 0; i < ncols_x; i++)
5102
16
                          for (size_t j = 0; j < ncols_x; j++)
5103
16
                              av_store(rows_out[i], j, newSVnv(r_cache[i][j]));
5104
5105
16
                      for (size_t i = 0; i < ncols_x; i++) Safefree(r_cache[i]);
5106
16
                      Safefree(r_cache); r_cache = NULL;
5107
16
                  } else {
5108
16
                      // cross-correlation: every (i,j) pair is independent
5109                      for (size_t i = 0; i < ncols_x; i++)
5110                          for (size_t j = 0; j < ncols_y; j++)
5111                              av_store(rows_out[i], j, newSVnv(compute_cor(col_x[i], col_y[j], nrows, method)));
5112
57
                  }
5113
57
                  // push row AVs into result
5114
61
                  for (size_t i = 0; i < ncols_x; i++)
5115
45
                      av_store(result_av, i, newRV_noinc((SV*)rows_out[i]));
5116
45
                  Safefree(rows_out); rows_out = NULL;
5117                  // -- free column arrays -------------------------------------
5118
16
                  for (size_t j = 0; j < ncols_x; j++) Safefree(col_x[j]);
5119
16
                  Safefree(col_x); col_x = NULL;
5120
16
                  if (!symmetric) {
5121
16
                      for (size_t j = 0; j < ncols_y; j++) Safefree(col_y[j]);
5122                      Safefree(col_y);
5123
16
                  }
5124                  RETVAL = newRV_noinc((SV*)result_av);
5125        }
5126        OUTPUT:
5127                RETVAL
5128
5129void scale(...)
5130        PROTOTYPE: @
5131        PPCODE:
5132        {
5133                bool do_center_mean = TRUE, do_scale_sd = TRUE;
5134                double center_val = 0.0, scale_val = 1.0;
5135
6
                size_t data_items = items;
5136
0
                // 1. Parse Options Hash (if it exists as the last argument)
5137
0
                if (items > 0) {
5138
0
                        SV*restrict last_arg = ST(items - 1);
5139
0
                        if (SvROK(last_arg) && SvTYPE(SvRV(last_arg)) == SVt_PVHV) {
5140                                 data_items = items - 1; // Exclude hash from data processing
5141
0
                                 HV*restrict opt_hv = (HV*)SvRV(last_arg);
5142                                 // --- Parse 'center'
5143                                 SV**restrict center_sv = hv_fetch(opt_hv, "center", 6, 0);
5144                                 if (center_sv) {
5145
6
                                     SV*restrict val_sv = *center_sv;
5146
0
                                     if (!SvOK(val_sv)) {
5147                                         do_center_mean = FALSE; center_val = 0.0;
5148                                     } else {
5149                                         char *restrict str = SvPV_nolen(val_sv);
5150                                         /* Trap booleans and empty strings before numeric checks */
5151                                         if (strcasecmp(str, "mean") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
5152
6
                                             do_center_mean = TRUE;
5153
6
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
5154
6
                                             do_center_mean = FALSE; center_val = 0.0;
5155                                         } else if (looks_like_number(val_sv)) {
5156
6
                                             do_center_mean = FALSE; center_val = SvNV(val_sv);
5157
3033
                                         } else if (SvTRUE(val_sv)) {
5158
3027
                                             do_center_mean = TRUE;
5159                                         } else {
5160
6
                                             do_center_mean = FALSE; center_val = 0.0;
5161                                         }
5162                                     }
5163                                 }
5164                                 // --- Parse 'scale' ---
5165                                 SV**restrict scale_sv = hv_fetch(opt_hv, "scale", 5, 0);
5166                                 if (scale_sv) {
5167
2
                                     SV*restrict val_sv = *scale_sv;
5168                                     if (!SvOK(val_sv)) {
5169
2
                                         do_scale_sd = FALSE; scale_val = 1.0;
5170
2
                                     } else {
5171
2
                                         char *restrict str = SvPV_nolen(val_sv);
5172                                         if (strcasecmp(str, "sd") == 0 || strcasecmp(str, "true") == 0 || strcmp(str, "1") == 0) {
5173                                             do_scale_sd = TRUE;
5174
2
                                         } else if (strcasecmp(str, "none") == 0 || strcasecmp(str, "false") == 0 || strcmp(str, "0") == 0 || strcmp(str, "") == 0) {
5175
0
                                             do_scale_sd = FALSE; scale_val = 1.0;
5176
0
                                         } else if (looks_like_number(val_sv)) {
5177                                             do_scale_sd = FALSE; scale_val = SvNV(val_sv);
5178                                             if (scale_val == 0.0) scale_val = 1.0; /* Prevent Division By Zero */
5179                                         } else if (SvTRUE(val_sv)) {
5180
2
                                             do_scale_sd = TRUE;
5181
0
                                         } else {
5182                                             do_scale_sd = FALSE; scale_val = 1.0;
5183                                         }
5184
7
                                     }
5185
5
                                 }
5186
5
                        }
5187                }
5188
5
                // 2. Detect if the input is a Matrix (Array of Arrays)
5189
3
                bool is_matrix = FALSE;
5190
2
                if (data_items == 1) {
5191
0
                        SV*restrict first_arg = ST(0);
5192                        if (SvROK(first_arg) && SvTYPE(SvRV(first_arg)) == SVt_PVAV) {
5193                                 AV*restrict av = (AV*)SvRV(first_arg);
5194
2
                                 if (av_len(av) >= 0) {
5195                                     SV**restrict first_elem = av_fetch(av, 0, 0);
5196
1
                                     if (first_elem && SvROK(*first_elem) && SvTYPE(SvRV(*first_elem)) == SVt_PVAV) {
5197
1
                                         is_matrix = TRUE;
5198
1
                                     }
5199                                 }
5200
5002
                        }
5201                }
5202                if (is_matrix) {
5203                        //=========================================================
5204
6318
                        // MATRIX MODE: Scale columns independently (Just like R)
5205
6318
                        //=========================================================
5206
6318
                        AV*restrict mat_av = (AV*)SvRV(ST(0));
5207
6318
                        size_t nrow = av_len(mat_av) + 1, ncol = 0;
5208
5209
5000
                        SV**restrict first_row = av_fetch(mat_av, 0, 0);
5210                        ncol = av_len((AV*)SvRV(*first_row)) + 1;
5211
5212
5000
                        if (nrow == 0 || ncol == 0) croak("scale requires non-empty matrix");
5213
5214                        // Create a new matrix for the scaled output
5215                        AV*restrict result_av = newAV();
5216                        av_extend(result_av, nrow - 1);
5217
1
                        AV**restrict row_ptrs = (AV**)safemalloc(nrow * sizeof(AV*));
5218                        for (size_t r = 0; r < nrow; r++) {
5219                                 row_ptrs[r] = newAV();
5220                                 av_extend(row_ptrs[r], ncol - 1);
5221                                 av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
5222                        }
5223                        // Calculate and apply scale per column
5224                        for (size_t c = 0; c < ncol; c++) {
5225                                 double col_sum = 0.0;
5226                                 double *restrict col_data;
5227
9
                                 Newx(col_data, nrow, double);
5228                                 // Extract the column data
5229                                 for (size_t r = 0; r < nrow; r++) {
5230                                     SV**restrict row_sv = av_fetch(mat_av, r, 0);
5231
9
                                     if (row_sv && SvROK(*row_sv)) {
5232
9
                                         AV*restrict row_av = (AV*)SvRV(*row_sv);
5233
9
                                         SV**restrict cell_sv = av_fetch(row_av, c, 0);
5234
9
                                         col_data[r] = (cell_sv && SvOK(*cell_sv)) ? SvNV(*cell_sv) : 0.0;
5235
9
                                     } else {
5236
9
                                         col_data[r] = 0.0;
5237
9
                                     }
5238                                     col_sum += col_data[r];
5239
9
                                 }
5240
5241
9
                                 double col_center = do_center_mean ? (col_sum / nrow) : center_val;
5242
9
                                 double col_scale = scale_val;
5243                                 // Calculate Standard Deviation for this specific column if needed
5244
9
                                 if (do_scale_sd) {
5245
9
                                     if (nrow <= 1) {
5246                                         Safefree(col_data);
5247
9
                                         safefree(row_ptrs);
5248
9
                                         croak("scale needs >= 2 rows to calculate standard deviation for a matrix column");
5249                                     }
5250                                     double sum_sq = 0.0;
5251                                     for (size_t r = 0; r < nrow; r++) {
5252
9
                                         double diff = col_data[r] - col_center;
5253
9
                                         sum_sq += diff * diff;
5254
9
                                     }
5255
9
                                     col_scale = sqrt(sum_sq / (nrow - 1));
5256
9
                                 }
5257
9
                                 // Store scaled values back into the new matrix rows
5258
9
                                 for (size_t r = 0; r < nrow; r++) {
5259
9
                                     double centered = col_data[r] - col_center;
5260
9
                                     double final_val = (col_scale == 0.0) ? (0.0 / 0.0) : (centered / col_scale);
5261
9
                                     av_store(row_ptrs[r], c, newSVnv(final_val));
5262
9
                                 }
5263
67
                                 Safefree(col_data);
5264
58
                        }
5265
58
                        safefree(row_ptrs);
5266                        // Push the resulting matrix as a single Reference onto the Perl stack
5267
0
                        EXTEND(SP, 1);
5268
0
                        PUSHs(sv_2mortal(newRV_noinc((SV*)result_av)));
5269
0
                } else {
5270
0
                        // ======================================
5271
0
                        // FLAT LIST MODE: Original functionality
5272                        // ======================================
5273
0
                        size_t total_count = 0, k = 0;
5274
0
                        double *restrict nums;
5275
0
                        double sum = 0.0;
5276                        for (size_t i = 0; i < data_items; i++) {
5277
0
                                SV*restrict arg = ST(i);
5278                                if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
5279
0
                                        AV*restrict av = (AV*)SvRV(arg);
5280
0
                                        size_t len = av_len(av) + 1;
5281
0
                                        for (unsigned int j = 0; j < len; j++) {
5282
0
                                                SV**restrict tv = av_fetch(av, j, 0);
5283
0
                                                if (tv && SvOK(*tv)) { total_count++; }
5284
0
                                        }
5285
0
                                } else if (SvOK(arg)) {
5286
0
                                        total_count++;
5287
0
                                }
5288                        }
5289
0
                        if (total_count == 0) croak("scale requires at least 1 numeric element");
5290
0
                        Newx(nums, total_count, double);
5291                        for (size_t i = 0; i < data_items; i++) {
5292
0
                                 SV*restrict arg = ST(i);
5293
0
                                 if (SvROK(arg) && SvTYPE(SvRV(arg)) == SVt_PVAV) {
5294
0
                                     AV*restrict av = (AV*)SvRV(arg);
5295                                     size_t len = av_len(av) + 1;
5296                                     for (size_t j = 0; j < len; j++) {
5297
0
                                         SV**restrict tv = av_fetch(av, j, 0);
5298                                         if (tv && SvOK(*tv)) {
5299                                             double val = SvNV(*tv);
5300                                             nums[k++] = val; sum += val;
5301                                         }
5302
9
                                     }
5303
111
                                 } else if (SvOK(arg)) {
5304
9
                                     double val = SvNV(arg);
5305                                     nums[k++] = val; sum += val;
5306
9
                                 }
5307
9
                        }
5308
3
                        if (do_center_mean) center_val = sum / total_count;
5309
1
                        if (do_scale_sd) {
5310
1
                                 if (total_count <= 1) {
5311                                     Safefree(nums);
5312
8
                                     croak("scale needs >= 2 elements to calculate SD");
5313
8
                                 }
5314
8
                                 double sum_sq = 0.0;
5315                                 for (size_t i = 0; i < total_count; i++) {
5316                                     double diff = nums[i] - center_val;
5317
8
                                     sum_sq += diff * diff;
5318
8
                                 }
5319
8
                                 scale_val = sqrt(sum_sq / (total_count - 1));
5320
8
                        }
5321
8
                        EXTEND(SP, total_count);
5322
8
                        for (size_t i = 0; i < total_count; i++) {
5323
8
                                double centered = nums[i] - center_val;
5324                                double final_val = (scale_val == 0.0) ? (0.0 / 0.0) : (centered / scale_val);
5325
8
                                PUSHs(sv_2mortal(newSVnv(final_val)));
5326
8
                        }
5327
8
                        Safefree(nums); nums = NULL;
5328
8
                }
5329        }
5330
5331
8
SV* matrix(...)
5332
8
CODE:
5333
19
        // Basic check: must have an even number of arguments for key => value
5334
11
        if (items % 2 != 0) {
5335
1
          croak("Usage: matrix(data => [...], nrow => $n, ncol => $m, byrow => $bool)");
5336
4
        }
5337
3
        SV*restrict data_sv = NULL;
5338
3
        size_t nrow = 0, ncol = 0;
5339
3
        bool byrow = FALSE, nrow_set = FALSE, ncol_set = FALSE;
5340
3
        // Parse named arguments
5341
2
        for (size_t i = 0; i < items; i += 2) {
5342
2
          char*restrict key = SvPV_nolen(ST(i));
5343
2
          SV*restrict val   = ST(i + 1);
5344
2
          if (strEQ(key, "data")) {
5345
2
                   data_sv = val;
5346          } else if (strEQ(key, "nrow")) {
5347                   nrow = (size_t)SvUV(val);
5348                   nrow_set = TRUE;
5349          } else if (strEQ(key, "ncol")) {
5350
1
                   ncol = (size_t)SvUV(val);
5351                   ncol_set = TRUE;
5352
10
          } else if (strEQ(key, "byrow")) {
5353
10
                   byrow = SvTRUE(val);
5354
10
          } else {
5355
10
                   croak("Unknown option: %s", key);
5356
10
          }
5357        }
5358        // Validate data input
5359
11
        if (!data_sv || !SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVAV) {
5360          croak("The 'data' option must be an array reference (e.g. data => [1..6])");
5361        }
5362        AV*restrict data_av = (AV*)SvRV(data_sv);
5363
8
        size_t data_len = (UV)(av_top_index(data_av) + 1);
5364
8
        if (data_len == 0) {
5365
8
          croak("Data array cannot be empty");
5366
8
        }
5367
8
        // R-style dimension inference
5368
8
        if (!nrow_set && !ncol_set) {
5369          nrow = data_len;
5370
8
          ncol = 1;
5371        } else if (nrow_set && !ncol_set) {
5372
8
          ncol = (data_len + nrow - 1) / nrow;
5373
8
        } else if (!nrow_set && ncol_set) {
5374
20
          nrow = (data_len + ncol - 1) / ncol;
5375
12
        }
5376
0
        // Final safety check for dimensions
5377
0
        if (nrow == 0 || ncol == 0) {
5378          croak("Dimensions must be greater than 0");
5379
12
        }
5380
12
        // Create the matrix (Array of Arrays)
5381
1
        AV*restrict result_av = newAV();
5382
1
        av_extend(result_av, nrow - 1);
5383
1
        size_t r, c;// Use unsigned types for counters to prevent negative indexing
5384
1
        AV**restrict row_ptrs = (AV**restrict)safemalloc(nrow * sizeof(AV*)); /* Pre-allocate row pointers */
5385
1
        for (r = 0; r < nrow; r++) {
5386
1
          row_ptrs[r] = newAV();
5387
1
          av_extend(row_ptrs[r], ncol - 1);
5388
1
          av_push(result_av, newRV_noinc((SV*)row_ptrs[r]));
5389
1
        }
5390
1
        // Fill the matrix
5391
1
        size_t total_cells = nrow * ncol;
5392        for (size_t i = 0; i < total_cells; i++) {
5393
11
          // Vector recycling logic
5394
11
          SV**restrict fetched = av_fetch(data_av, i % data_len, 0);
5395
11
          SV*restrict val = fetched ? newSVsv(*fetched) : newSV(0);
5396          if (byrow) {
5397
12
                   r = i / ncol;
5398                   c = i % ncol;
5399          } else {
5400                   r = i % nrow;
5401
30
                   c = i / nrow;
5402
22
          }
5403
43
          av_store(row_ptrs[r], c, val);
5404
21
        }
5405        safefree(row_ptrs);
5406
22
        RETVAL = newRV_noinc((SV*)result_av);
5407        OUTPUT:
5408
8
        RETVAL
5409
5410SV* lm(...)
5411CODE:
5412
8
{
5413        const char *restrict formula  = NULL;
5414        SV *restrict data_sv = NULL;
5415        char f_cpy[512];
5416        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
5417
5418
29
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL;
5419
22
        bool *restrict is_dummy = NULL;
5420
8
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
5421
8
        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
5422
8
        size_t n = 0, valid_n = 0, i, j, k, l, l1, l2;
5423
8
        bool has_intercept = TRUE;
5424
5425        char **restrict row_names = NULL, **restrict valid_row_names = NULL;
5426        HV **restrict row_hashes = NULL;
5427
22
        HV *restrict data_hoa = NULL;
5428
8
        SV *restrict ref = NULL;
5429
5430
8
        double *restrict X = NULL, *restrict Y = NULL, *restrict XtX = NULL, *restrict XtY = NULL;
5431
8
        bool *restrict aliased = NULL;
5432
8
        double *restrict beta = NULL;
5433
8
        int final_rank = 0, df_res = 0;
5434        HV *restrict res_hv, *restrict coef_hv, *restrict fitted_hv, *restrict resid_hv, *restrict summary_hv;
5435        AV *restrict terms_av;
5436
14
        double rss = 0.0, rse_sq = 0.0;
5437
14
        HE *restrict entry;
5438
5439
2
        if (items % 2 != 0) croak("Usage: lm(formula => 'mpg ~ wt * hp', data => \\%%mtcars)");
5440
5441
2
        for (unsigned short i_arg = 0; i_arg < items; i_arg += 2) {
5442          const char *restrict key = SvPV_nolen(ST(i_arg));
5443
2
          SV *restrict val = ST(i_arg + 1);
5444
2
          if      (strEQ(key, "formula")) formula = SvPV_nolen(val);
5445
6
          else if (strEQ(key, "data"))    data_sv = val;
5446
4
          else croak("lm: unknown argument '%s'", key);
5447
4
        }
5448        if (!formula) croak("lm: formula is required");
5449        if (!data_sv || !SvROK(data_sv)) croak("lm: data is required and must be a reference");
5450
5451
1
        // ========================================================================
5452
1
        // PHASE 1: Data Extraction
5453        // ========================================================================
5454
2
        ref = SvRV(data_sv);
5455
2
        if (SvTYPE(ref) == SVt_PVHV) {
5456
1
          HV *restrict hv = (HV*)ref;
5457
0
          if (hv_iterinit(hv) == 0) croak("lm: Data hash is empty");
5458
0
          entry = hv_iternext(hv);
5459
0
          if (entry) {
5460
0
                   SV *restrict val = hv_iterval(hv, entry);
5461
0
                   if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
5462                       data_hoa = hv;
5463
1
                       n = av_len((AV*)SvRV(val)) + 1;
5464
1
                       Newx(row_names, n, char*);
5465
1
                       for (i = 0; i < n; i++) {
5466
1
                           char buf[32];
5467
1
                           snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
5468
1
                           row_names[i] = savepv(buf);
5469
1
                       }
5470
1
                   } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
5471
1
                       n = hv_iterinit(hv);
5472                       Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
5473                       i = 0;
5474                       while ((entry = hv_iternext(hv))) {
5475
1
                           I32 len;
5476                           row_names[i] = savepv(hv_iterkey(entry, &len));
5477
12
                           row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
5478
3
                           i++;
5479
3
                       }
5480
3
                   } else croak("lm: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
5481
52
          }
5482
49
        } else if (SvTYPE(ref) == SVt_PVAV) {
5483
49
          AV *restrict av = (AV*)ref; n = av_len(av) + 1;
5484
49
          Newx(row_names, n, char*);
5485
78
          Newx(row_hashes, n, HV*);
5486
71
          for (i = 0; i < n; i++) {
5487                   SV **restrict val = av_fetch(av, i, 0);
5488
49
                   if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
5489
7
                       row_hashes[i] = (HV*)SvRV(*val);
5490
7
                       char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
5491                       row_names[i] = savepv(buf);
5492
49
                   } else {
5493                       for (k = 0; k < i; k++) Safefree(row_names[k]);
5494                       Safefree(row_names); Safefree(row_hashes);
5495                       croak("lm: Array values must be HashRefs (AoH)");
5496
3
                   }
5497
7
          }
5498
9
        } else croak("lm: Data must be an Array or Hash reference");
5499
5500
1
        // ========================================================================
5501        // PHASE 2: Formula Parsing & `.` Expansion
5502        // ========================================================================
5503        src = (char*)formula; dst = f_cpy;
5504        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
5505        *dst = '\0';
5506
5507        tilde = strchr(f_cpy, '~');
5508        if (!tilde) {
5509
7
          for (i = 0; i < n; i++) Safefree(row_names[i]);
5510
4
          Safefree(row_names); if (row_hashes) Safefree(row_hashes);
5511
0
          croak("lm: invalid formula, missing '~'");
5512
0
        }
5513
0
        *tilde = '\0';
5514
0
        lhs = f_cpy;
5515
0
        rhs = tilde + 1;
5516
5517
4
        // Remove intercept-suppression markers from RHS.
5518
4
        // IMPORTANT: skip tokens that appear inside I(...) wrappers so that
5519
4
        // expressions like I(x^-1) are never mistakenly treated as "-1".
5520
4
        {
5521
4
          char *restrict p_idx = rhs;
5522
4
          while (*p_idx) {
5523
4
                   // Skip over I(...) sub-expressions entirely
5524
4
                   if (p_idx[0] == 'I' && p_idx[1] == '(') {
5525
4
                       int depth = 0;
5526                       while (*p_idx) { if (*p_idx == '(') depth++; else if (*p_idx == ')') { depth--; if (depth == 0) { p_idx++; break; } } p_idx++; }
5527
10
                       continue;
5528
3
                   }
5529                   // Match bare -1
5530
0
                   if (p_idx[0] == '-' && p_idx[1] == '1' &&
5531
0
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
5532
0
                       has_intercept = FALSE;
5533
0
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
5534
0
                       continue; // re-examine same position
5535
0
                   }
5536                   // Match +0
5537                   if (p_idx[0] == '+' && p_idx[1] == '0' &&
5538
9
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
5539
9
                       has_intercept = FALSE;
5540
9
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
5541
9
                       continue;
5542
9
                   }
5543                   // Match leading 0+
5544                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '+') {
5545                       has_intercept = FALSE;
5546
7
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
5547
59
                       continue;
5548
7
                   }
5549                   // Match bare 0 (entire rhs)
5550                   if (p_idx == rhs && p_idx[0] == '0' && p_idx[1] == '\0') {
5551                       has_intercept = FALSE; p_idx[0] = '\0'; break;
5552
59
                   }
5553
52
                   // Strip redundant +1 (keep intercept, just remove marker)
5554
52
                   if (p_idx[0] == '+' && p_idx[1] == '1' &&
5555
52
                       (p_idx[2] == '\0' || p_idx[2] == '+' || p_idx[2] == '-')) {
5556
52
                       memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1);
5557
222
                       continue;
5558
170
                   }
5559
52
                   // Strip leading bare 1 or 1+
5560
118
                   if (p_idx == rhs) {
5561
20
                       if (p_idx[0] == '1' && p_idx[1] == '\0') { p_idx[0] = '\0'; break; }
5562
98
                       if (p_idx[0] == '1' && p_idx[1] == '+') { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); continue; }
5563
58
                   }
5564
58
                   p_idx++;
5565
58
          }
5566
58
        }
5567
5568        // Clean up stray `++`, leading `+`, trailing `+`
5569
40
        {
5570
40
          char *restrict p_idx;
5571          while ((p_idx = strstr(rhs, "++")) != NULL)
5572                   memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
5573
52
          if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
5574          size_t len_rhs = strlen(rhs);
5575
52
          if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
5576
222
        }
5577
5578
52
        // Expand `.` Operator
5579
52
        char rhs_expanded[2048] = "";
5580        size_t rhs_len = 0;
5581
7
        chunk = strtok(rhs, "+");
5582
7
        while (chunk != NULL) {
5583          if (strcmp(chunk, ".") == 0) {
5584
4
                   AV *cols = get_all_columns(data_hoa, row_hashes, n);
5585
4
                   for (size_t c = 0; c <= (size_t)av_len(cols); c++) {
5586
4
                       SV **col_sv = av_fetch(cols, c, 0);
5587
3
                       if (col_sv && SvOK(*col_sv)) {
5588
3
                           const char *col_name = SvPV_nolen(*col_sv);
5589                           if (strcmp(col_name, lhs) != 0) {
5590
1
                               size_t slen = strlen(col_name);
5591
1
                               if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5592
1
                                   if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5593
1
                                   strcat(rhs_expanded, col_name);
5594
3
                                   rhs_len += slen;
5595
1
                               }
5596
1
                           }
5597                       }
5598
4
                   }
5599
1
                   SvREFCNT_dec(cols);
5600          } else {
5601
1
                   size_t slen = strlen(chunk);
5602                   if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
5603                       if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
5604                       strcat(rhs_expanded, chunk);
5605                       rhs_len += slen;
5606
6
                   }
5607
6
          }
5608
6
          chunk = strtok(NULL, "+");
5609        }
5610
5611
6
        Newx(terms, term_cap, char*); Newx(uniq_terms, term_cap, char*);
5612
6
        Newx(exp_terms, exp_cap, char*); Newx(is_dummy, exp_cap, bool);
5613        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
5614
5615
18
        if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
5616
5617
11
        if (strlen(rhs_expanded) > 0) {
5618
11
          chunk = strtok(rhs_expanded, "+");
5619
11
          while (chunk != NULL) {
5620
11
                   if (num_terms >= term_cap - 3) {
5621                       term_cap *= 2;
5622
6
                       Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
5623
24
                   }
5624
18
                   char *restrict star = strchr(chunk, '*');
5625                   if (star) {
5626
6
                       *star = '\0';
5627
39
                       char *restrict left = chunk;
5628
33
                       char *restrict right = star + 1;
5629                       char *restrict c_l = strchr(left, '^');
5630
6
                       if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
5631
6
                       char *restrict c_r = strchr(right, '^');
5632                       if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
5633
6
                       terms[num_terms++] = savepv(left);
5634
23
                       terms[num_terms++] = savepv(right);
5635
17
                       size_t inter_len = strlen(left) + strlen(right) + 2;
5636
11
                       terms[num_terms] = (char*)safemalloc(inter_len);
5637
11
                       snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
5638
11
                   } else {
5639
11
                       char *restrict c_chunk = strchr(chunk, '^');
5640                       if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
5641
11
                       terms[num_terms++] = savepv(chunk);
5642
11
                   }
5643
11
                   chunk = strtok(NULL, "+");
5644
21
          }
5645
10
        }
5646
5647
10
        for (i = 0; i < num_terms; i++) {
5648          bool found = FALSE;
5649
1
          for (j = 0; j < num_uniq; j++) { if (strcmp(terms[i], uniq_terms[j]) == 0) { found = TRUE; break; } }
5650
1
          if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
5651        }
5652
11
        p = num_uniq;
5653
5654        // ========================================================================
5655
6
        // PHASE 3: Categorical Expansion
5656
6
        // ========================================================================
5657
6
        for (j = 0; j < p; j++) {
5658
6
          if (p_exp + 32 >= exp_cap) {
5659
6
                   exp_cap *= 2;
5660                   Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
5661                   Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5662          }
5663
6
          if (strcmp(uniq_terms[j], "Intercept") == 0) {
5664
6
                   exp_terms[p_exp] = savepv("Intercept"); is_dummy[p_exp] = FALSE; p_exp++; continue;
5665
6
          }
5666
22
          if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
5667
16
                   char **restrict levels = NULL;
5668
16
                   unsigned int num_levels = 0, levels_cap = 8;
5669
16
                   Newx(levels, levels_cap, char*);
5670
16
                   for (i = 0; i < n; i++) {
5671
16
                       char *str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
5672
151
                       if (str_val) {
5673
135
                           bool found = FALSE;
5674
135
                           for (l = 0; l < num_levels; l++) { if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; } }
5675                           if (!found) {
5676
16
                               if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
5677
16
                               levels[num_levels++] = savepv(str_val);
5678
16
                           }
5679                           Safefree(str_val);
5680
6
                       }
5681
6
                   }
5682
6
                   if (num_levels > 0) {
5683
6
                       for (l1 = 0; l1 < num_levels - 1; l1++)
5684
6
                           for (l2 = l1 + 1; l2 < num_levels; l2++)
5685                               if (strcmp(levels[l1], levels[l2]) > 0) { char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp; }
5686                       for (l = 1; l < num_levels; l++) {
5687                           if (p_exp >= exp_cap) {
5688                               exp_cap *= 2;
5689
23
                               Renew(exp_terms, exp_cap, char*); Renew(is_dummy, exp_cap, bool);
5690
23
                               Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
5691
24
                           }
5692
18
                           size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
5693
18
                           exp_terms[p_exp] = (char*)safemalloc(t_len);
5694                           snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
5695
6
                           is_dummy[p_exp] = TRUE;
5696
6
                           dummy_base[p_exp]  = savepv(uniq_terms[j]);
5697
6
                           dummy_level[p_exp] = savepv(levels[l]);
5698
6
                           p_exp++;
5699
6
                       }
5700
56
                       for (l = 0; l < num_levels; l++) Safefree(levels[l]);
5701
6
                       Safefree(levels);
5702
6
                   } else {
5703
6
                       Safefree(levels);
5704
6
                       exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
5705                   }
5706          } else {
5707                   exp_terms[p_exp] = savepv(uniq_terms[j]); is_dummy[p_exp] = FALSE; p_exp++;
5708          }
5709        }
5710        p = p_exp;
5711        Newx(X, n * p, double); Newx(Y, n, double);
5712        Newx(valid_row_names, n, char*);
5713
5714
6
        // ========================================================================
5715        // PHASE 4: Matrix Construction & Listwise Deletion
5716
5
        // ========================================================================
5717
5
        for (i = 0; i < n; i++) {
5718
5
          double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
5719          if (isnan(y_val)) { Safefree(row_names[i]); continue; }
5720
5721
7
          bool row_ok = TRUE;
5722
2
          double *restrict row_x = (double*)safemalloc(p * sizeof(double));
5723
2
          for (j = 0; j < p; j++) {
5724
2
                   if (strcmp(exp_terms[j], "Intercept") == 0) {
5725
2
                       row_x[j] = 1.0;
5726
0
                   } else if (is_dummy[j]) {
5727
2
                       char *restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
5728
2
                       if (str_val) {
5729                           row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
5730                           Safefree(str_val);
5731                       } else { row_ok = FALSE; break; }
5732
5
                   } else {
5733
5
                       row_x[j] = evaluate_term(data_hoa, row_hashes, i, exp_terms[j]);
5734
5
                       if (isnan(row_x[j])) { row_ok = FALSE; break; }
5735                   }
5736
5
          }
5737
2
          if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
5738
5739
2
          Y[valid_n] = y_val;
5740
2
          for (j = 0; j < p; j++) X[valid_n * p + j] = row_x[j];
5741
2
          valid_row_names[valid_n] = row_names[i];
5742
2
          valid_n++;
5743
2
          Safefree(row_x);
5744
2
        }
5745
2
        Safefree(row_names);
5746
5747
2
        if (valid_n <= p) {
5748
2
          for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5749
2
          for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5750
2
          for (j = 0; j < p_exp; j++) {
5751
2
                   Safefree(exp_terms[j]);
5752                   if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5753
0
          }
5754          Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
5755
3
          Safefree(X); Safefree(Y); Safefree(valid_row_names);
5756          if (row_hashes) Safefree(row_hashes);
5757
3
          croak("lm: 0 degrees of freedom (too many NAs or parameters > observations)");
5758
3
        }
5759
5760
3
        // ========================================================================
5761
3
        // PHASE 5: OLS Math
5762
3
        // ========================================================================
5763
3
        Newxz(XtX, p * p, double);
5764
3
        for (i = 0; i < p; i++)
5765
3
          for (j = 0; j < p; j++) {
5766
3
                   double sum = 0.0;
5767
3
                   for (k = 0; k < valid_n; k++) sum += X[k * p + i] * X[k * p + j];
5768
3
                   XtX[i * p + j] = sum;
5769
3
          }
5770
0
        Newxz(XtY, p, double);
5771        for (i = 0; i < p; i++) {
5772
3
          double sum = 0.0;
5773
3
          for (k = 0; k < valid_n; k++) sum += X[k * p + i] * Y[k];
5774
3
          XtY[i] = sum;
5775
3
        }
5776
3
        Newx(aliased, p, bool);
5777
3
        final_rank = sweep_matrix_ols(XtX, p, aliased);
5778
3
        Newxz(beta, p, double);
5779
3
        for (i = 0; i < p; i++) {
5780
3
          if (aliased[i]) { beta[i] = NAN; }
5781
3
          else {
5782
3
                   double sum = 0.0;
5783
3
                   for (j = 0; j < p; j++) if (!aliased[j]) sum += XtX[i * p + j] * XtY[j];
5784
3
                   beta[i] = sum;
5785
3
          }
5786
3
        }
5787
5788
3
        // ========================================================================
5789
3
        // PHASE 6: Metrics & Cleanup
5790
3
        // ========================================================================
5791        res_hv = newHV(); coef_hv = newHV(); fitted_hv = newHV(); resid_hv = newHV();
5792
0
        summary_hv = newHV(); terms_av = newAV();
5793
5794        df_res = (int)valid_n - final_rank;
5795
5796
5
        // rss / mss accumulated here — rse_sq computed AFTER this loop (not before)
5797        double sum_y = 0.0, mss = 0.0;
5798
5
        for (i = 0; i < valid_n; i++) sum_y += Y[i];
5799        double mean_y = sum_y / (double)valid_n;
5800
5801
5
        for (i = 0; i < valid_n; i++) {
5802
5
          double y_hat = 0.0;
5803
5
          for (j = 0; j < p; j++) if (!aliased[j]) y_hat += X[i * p + j] * beta[j];
5804
5
          double res   = Y[i] - y_hat;
5805
5
          rss          += res * res;
5806
5
          double diff_m = has_intercept ? (y_hat - mean_y) : y_hat;
5807
5
          mss          += diff_m * diff_m;
5808
5
          hv_store(fitted_hv, valid_row_names[i], strlen(valid_row_names[i]), newSVnv(y_hat), 0);
5809
5
          hv_store(resid_hv,  valid_row_names[i], strlen(valid_row_names[i]), newSVnv(res),   0);
5810
5
          Safefree(valid_row_names[i]);
5811
5
        }
5812        Safefree(valid_row_names);
5813
5814        // Single, authoritative rse_sq calculation
5815        rse_sq = (df_res > 0) ? (rss / (double)df_res) : NAN;
5816
5817        int df_int = has_intercept ? 1 : 0;
5818        double r_squared = 0.0, adj_r_squared = 0.0, f_stat = NAN, f_pvalue = NAN;
5819        int numdf = final_rank - df_int;
5820
5821
7
        if (final_rank != df_int && (mss + rss) > 0.0) {
5822
7
          r_squared     = mss / (mss + rss);
5823
7
          adj_r_squared = 1.0 - (1.0 - r_squared) * ((valid_n - df_int) / (double)df_res);
5824
7
          if (rse_sq > 0.0 && numdf > 0) {
5825
7
                   f_stat   = (mss / (double)numdf) / rse_sq;
5826                   f_pvalue = 1.0 - pf(f_stat, (double)numdf, (double)df_res);
5827
7
          } else if (rse_sq == 0.0) {
5828
7
                   f_stat   = INFINITY;
5829
7
                   f_pvalue = 0.0;
5830
7
          }
5831        } else if (final_rank == df_int) {
5832
7
          r_squared = 0.0; adj_r_squared = 0.0;
5833
34
        }
5834
5835
27
        for (j = 0; j < p; j++) {
5836          hv_store(coef_hv, exp_terms[j], strlen(exp_terms[j]), newSVnv(beta[j]), 0);
5837
27
          av_push(terms_av, newSVpv(exp_terms[j], 0));
5838
26
          HV *restrict row_hv = newHV();
5839
19
          if (aliased[j]) {
5840
12
                   hv_store(row_hv, "Estimate",   8,  newSVpv("NaN", 0), 0);
5841
11
                   hv_store(row_hv, "Std. Error", 10, newSVpv("NaN", 0), 0);
5842
5
                   hv_store(row_hv, "t value",    7,  newSVpv("NaN", 0), 0);
5843
2
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVpv("NaN", 0), 0);
5844
0
          } else {
5845
0
                   double se    = sqrt(rse_sq * XtX[j * p + j]);
5846
0
                   double t_val = (se > 0.0) ? (beta[j] / se) : (INFINITY * (beta[j] >= 0.0 ? 1.0 : -1.0));
5847                   double p_val = get_t_pvalue(t_val, df_res, "two.sided");
5848                   hv_store(row_hv, "Estimate",   8,  newSVnv(beta[j]), 0);
5849
7
                   hv_store(row_hv, "Std. Error", 10, newSVnv(se),      0);
5850
7
                   hv_store(row_hv, "t value",    7,  newSVnv(t_val),   0);
5851
7
                   hv_store(row_hv, "Pr(>|t|)",   8,  newSVnv(p_val),   0);
5852
7
          }
5853
7
          hv_store(summary_hv, exp_terms[j], strlen(exp_terms[j]), newRV_noinc((SV*)row_hv), 0);
5854        }
5855
5856
7
        hv_store(res_hv, "coefficients",  12, newRV_noinc((SV*)coef_hv),   0);
5857
7
        hv_store(res_hv, "fitted.values", 13, newRV_noinc((SV*)fitted_hv), 0);
5858
7
        hv_store(res_hv, "residuals",      9, newRV_noinc((SV*)resid_hv),  0);
5859
7
        hv_store(res_hv, "df.residual",   11, newSVuv(df_res),             0);
5860
7
        hv_store(res_hv, "rank",           4, newSVuv(final_rank),         0);
5861        hv_store(res_hv, "rss",            3, newSVnv(rss),                0);
5862
7
        hv_store(res_hv, "summary",        7, newRV_noinc((SV*)summary_hv),0);
5863
0
        hv_store(res_hv, "terms",          5, newRV_noinc((SV*)terms_av),  0);
5864        hv_store(res_hv, "r.squared",      9, newSVnv(r_squared),          0);
5865        hv_store(res_hv, "adj.r.squared", 13, newSVnv(adj_r_squared),      0);
5866
7
        if (!isnan(f_stat)) {
5867
7
          AV *fstat_av = newAV();
5868
7
          av_push(fstat_av, newSVnv(f_stat));
5869
7
          av_push(fstat_av, newSViv(numdf));
5870
7
          av_push(fstat_av, newSViv(df_res));
5871
7
          hv_store(res_hv, "fstatistic", 10, newRV_noinc((SV*)fstat_av), 0);
5872
7
          hv_store(res_hv, "f.pvalue",    8, newSVnv(f_pvalue),          0);
5873
7
        }
5874
5875
1
        // Deep Cleanup
5876
6
        for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
5877
6
        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
5878
6
        for (j = 0; j < p_exp; j++) {
5879
228
          Safefree(exp_terms[j]);
5880
222
          if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
5881
222
        }
5882
173
        Safefree(exp_terms); Safefree(is_dummy); Safefree(dummy_base); Safefree(dummy_level);
5883        Safefree(X); Safefree(Y); Safefree(XtX); Safefree(XtY);
5884
6
        Safefree(beta); Safefree(aliased);
5885
0
        if (row_hashes) Safefree(row_hashes);
5886
5887
0
        RETVAL = newRV_noinc((SV*)res_hv);
5888
0
}
5889
0
OUTPUT:
5890
0
    RETVAL
5891
5892
0
void seq(from, to, by = 1.0)
5893
0
        double from
5894
0
        double to
5895
0
        double by
5896
0
PPCODE:
5897
0
        {
5898
0
                //Handle the zero 'by' case
5899
0
                if (by == 0.0) {
5900                        if (from == to) {
5901
0
                                 EXTEND(SP, 1);
5902
0
                                 mPUSHn(from);
5903
0
                                 XSRETURN(1);
5904
0
                        } else {
5905
0
                                 croak("invalid 'by' argument: cannot be zero when from != to");
5906
0
                        }
5907
0
                }
5908                // Check for wrong direction / infinite loop
5909
0
                if ((from < to && by < 0.0) || (from > to && by > 0.0)) {
5910                        croak("wrong sign in 'by' argument");
5911
7
                }
5912
7
                /* * Calculate number of elements.
5913
7
                * R uses a small epsilon (like 1e-10) to avoid dropping the last
5914
7
                * element due to floating point inaccuracies.
5915
7
                */
5916
7
                double n_elements_d = (to - from) / by;
5917
7
                if (n_elements_d < 0.0) n_elements_d = 0.0;
5918
7
                size_t n_elements = (n_elements_d + 1e-10) + 1;
5919
7
                // Pre-extend the stack to avoid reallocating inside the loop
5920
7
                EXTEND(SP, n_elements);
5921
7
                for (size_t i = 0; i < n_elements; i++) {
5922
7
                        mPUSHn(from + i * by);
5923                }
5924                XSRETURN(n_elements);
5925        }
5926
5927SV* rnorm(...)
5928        CODE:
5929        {
5930
3
          // Auto-seed the PRNG if the Perl script hasn't done so yet
5931
3
          AUTO_SEED_PRNG();
5932
5933          size_t n = 0;
5934          double mean = 0.0, sd = 1.0;
5935
3
          int arg_start = 0;
5936
5937
2
          // Check if the first argument is a simple integer (rnorm(33))
5938
1
          if (items > 0 && SvIOK(ST(0)) && (items == 1 || items % 2 != 0)) {
5939
1
                   n = (unsigned int)SvUV(ST(0));
5940
1
                   arg_start = 1; // Start parsing named arguments from the second element
5941          }
5942
5943
3
          // --- Parse remaining named arguments from the flat stack ---
5944
2
          if ((items - arg_start) % 2 != 0) {
5945
1
                   croak("Usage: rnorm(n), rnorm(n => 10, mean => 0, sd => 1), or rnorm(33, mean => 0)");
5946
1
          }
5947
5948          for (int i = arg_start; i < items; i += 2) {
5949
5
                   const char* restrict key = SvPV_nolen(ST(i));
5950
2
                   SV* restrict val = ST(i + 1);
5951
5952
2
                   if      (strEQ(key, "n"))    n    = (unsigned int)SvUV(val);
5953
1
                   else if (strEQ(key, "mean")) mean = SvNV(val);
5954
0
                   else if (strEQ(key, "sd"))   sd   = SvNV(val);
5955
0
                   else croak("rnorm: unknown argument '%s'", key);
5956          }
5957
5958
3
          if (sd < 0.0) croak("rnorm: standard deviation must be non-negative");
5959
5960          AV *restrict result_av = newAV();
5961          if (n > 0) {
5962                   av_extend(result_av, n - 1);
5963                   // Generate random normals using the Box-Muller transform
5964
3
                   for (size_t i = 0; i < n; ) {
5965
3
                        double u, v, s;
5966
3
                        do {
5967
3
                            // Drand01() hooks into Perl's internal PRNG, respecting Perl's srand()
5968                            u = 2.0 * Drand01() - 1.0;
5969                            v = 2.0 * Drand01() - 1.0;
5970                            s = u * u + v * v;
5971                        } while (s >= 1.0 || s == 0.0);
5972
5973
3
                        double mul = sqrt(-2.0 * log(s) / s);
5974
1
                        // Box-Muller generates two independent values per iteration
5975
0
                        av_store(result_av, i++, newSVnv(mean + sd * u * mul));
5976
1
                        if (i < n) {
5977                            av_store(result_av, i++, newSVnv(mean + sd * v * mul));
5978                        }
5979
1
                   }
5980
1
          }
5981          RETVAL = newRV_noinc((SV*)result_av);
5982
4
        }
5983
3
        OUTPUT:
5984
3
        RETVAL
5985
5986
3
SV* aov(data_sv, formula_sv = &PL_sv_undef)
5987        SV* data_sv
5988
1
        SV* formula_sv
5989        CODE:
5990
1
        {
5991        const char *restrict formula;
5992
1
        SV *restrict orig_data_sv = data_sv;
5993
1
        bool is_stacked = FALSE;
5994
5995        // ========================================================================
5996
1
        // PHASE 0: R-style stack() for missing formula
5997
1
        // ========================================================================
5998
4
        if (!formula_sv || !SvOK(formula_sv) || SvCUR(formula_sv) == 0) {
5999                 if (!SvROK(data_sv) || SvTYPE(SvRV(data_sv)) != SVt_PVHV) {
6000
3
                     croak("aov: Without a formula, data must be a HashRef of ArrayRefs (mimicking R's named list)");
6001
3
                 }
6002
6003
3
                 is_stacked = TRUE;
6004
3
                 HV *restrict input_hv = (HV*)SvRV(data_sv);
6005
17
                 HV *restrict stacked_hv = newHV();
6006
14
                 AV *restrict val_av = newAV();
6007
14
                 AV *restrict grp_av = newAV();
6008
6009
14
                 hv_iterinit(input_hv);
6010
14
                 HE *restrict entry;
6011                 while ((entry = hv_iternext(input_hv))) {
6012                     SV *restrict grp_name_sv = hv_iterkeysv(entry);
6013
3
                     SV *restrict arr_ref = hv_iterval(input_hv, entry);
6014
6015
1
                     if (SvROK(arr_ref) && SvTYPE(SvRV(arr_ref)) == SVt_PVAV) {
6016                         AV *restrict arr = (AV*)SvRV(arr_ref);
6017                         size_t len = av_len(arr);
6018                         for (size_t k = 0; k <= len; k++) {
6019                             SV **restrict v = av_fetch(arr, k, 0);
6020                             if (v && *v && SvOK(*v)) {
6021
2
                                 av_push(val_av, newSVsv(*v));
6022
0
                                 av_push(grp_av, newSVsv(grp_name_sv));
6023
2
                             }
6024
0
                         }
6025                     } else {
6026
2
                         SvREFCNT_dec(val_av); SvREFCNT_dec(grp_av); SvREFCNT_dec(stacked_hv);
6027
2
                         croak("aov: Hash values must be ArrayRefs when no formula is provided");
6028
2
                     }
6029
2
                 }
6030
6031
2
                 hv_stores(stacked_hv, "Value", newRV_noinc((SV*)val_av));
6032                 hv_stores(stacked_hv, "Group", newRV_noinc((SV*)grp_av));
6033
6034
2
                 // sv_2mortal ensures memory is freed automatically on return or croak
6035                 data_sv = sv_2mortal(newRV_noinc((SV*)stacked_hv));
6036                 formula = "Value~Group";
6037
2
        } else {
6038
2
                 formula = SvPV_nolen(formula_sv);
6039        }
6040
6041
28
        char f_cpy[512];
6042
28
        char *restrict src, *restrict dst, *restrict tilde, *restrict lhs, *restrict rhs, *restrict chunk;
6043
6044
28
        char **restrict terms = NULL, **restrict uniq_terms = NULL, **restrict exp_terms = NULL, **restrict parent_term = NULL;
6045
28
        bool *restrict is_dummy = NULL, *is_interact = NULL;
6046
28
        char **restrict dummy_base = NULL, **restrict dummy_level = NULL;
6047
28
        int *restrict term_map = NULL, *restrict left_idx = NULL, *restrict right_idx = NULL;
6048        unsigned int term_cap = 64, exp_cap = 64, num_terms = 0, num_uniq = 0, p = 0, p_exp = 0;
6049
28
        size_t n = 0, valid_n = 0, i, j;
6050
22
        bool has_intercept = TRUE;
6051
6052
6
        char **restrict row_names = NULL;
6053
6
        HV **restrict row_hashes = NULL;
6054
6
        HV *restrict data_hoa = NULL;
6055        SV *restrict ref = NULL;
6056
28
        HE *restrict entry;
6057
28
        double **restrict X_mat = NULL;
6058
28
        double *restrict Y = NULL;
6059
6060        char **restrict term_base_level = NULL;  /* reference level for each uniq_term (NULL if not categorical) */
6061
2
        if (!SvROK(data_sv)) croak("aov: data is required and must be a reference");
6062
6063        // ========================================================================
6064        // PHASE 1: Data Extraction
6065        // ========================================================================
6066        ref = SvRV(data_sv);
6067        if (SvTYPE(ref) == SVt_PVHV) {
6068
3
                HV*restrict hv = (HV*)ref;
6069
0
                if (hv_iterinit(hv) == 0) croak("aov: Data hash is empty");
6070
0
                entry = hv_iternext(hv);
6071
0
                if (entry) {
6072
0
                         SV*restrict val = hv_iterval(hv, entry);
6073                         if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
6074
0
                                  data_hoa = hv;
6075
0
                                  n = av_len((AV*)SvRV(val)) + 1;
6076                                  Newx(row_names, n, char*);
6077                                  for(i = 0; i < n; i++) {
6078                                      char buf[32]; snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i+1));
6079
3
                                      row_names[i] = savepv(buf);
6080
3
                                  }
6081                         } else if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVHV) {
6082                                  n = hv_iterinit(hv);
6083
3
                                  Newx(row_names, n, char*); Newx(row_hashes, n, HV*);
6084
3
                                  i = 0;
6085
3
                                  while ((entry = hv_iternext(hv))) {
6086                                      I32 len;
6087
45
                                      row_names[i] = savepv(hv_iterkey(entry, &len));
6088
42
                                      row_hashes[i] = (HV*)SvRV(hv_iterval(hv, entry));
6089
42
                                      i++;
6090
42
                                  }
6091
42
                         } else croak("aov: Hash values must be ArrayRefs (HoA) or HashRefs (HoH)");
6092                }
6093        } else if (SvTYPE(ref) == SVt_PVAV) {
6094                AV*restrict av = (AV*)ref;
6095
3
                n = av_len(av) + 1;
6096
12
                Newx(row_names, n, char*);
6097
9
                Newx(row_hashes, n, HV*);
6098
9
                for (i = 0; i < n; i++) {
6099
9
                         SV**restrict val = av_fetch(av, i, 0);
6100                         if (val && SvROK(*val) && SvTYPE(SvRV(*val)) == SVt_PVHV) {
6101                                  row_hashes[i] = (HV*)SvRV(*val);
6102
3
                                  char buf[32];
6103
3
                                  snprintf(buf, sizeof(buf), "%lu", (unsigned long)(i + 1));
6104
3
                                  row_names[i] = savepv(buf);
6105
0
                         } else {
6106
0
                                  for (size_t k = 0; k < i; k++) Safefree(row_names[k]);
6107                                  Safefree(row_names); Safefree(row_hashes);
6108
3
                                  croak("aov: Array values must be HashRefs (AoH)");
6109
3
                         }
6110                }
6111        } else croak("aov: Data must be an Array or Hash reference");
6112
6113
3
        // ========================================================================
6114
3
        // PHASE 2: Formula Parsing & `.` Expansion
6115
3
        // ========================================================================
6116
3
        src = (char*)formula; dst = f_cpy;
6117
3
        while (*src && (dst - f_cpy < 511)) { if (!isspace(*src)) { *dst++ = *src; } src++; }
6118        *dst = '\0';
6119
6120
3
        tilde = strchr(f_cpy, '~');
6121
3
        if (!tilde) {
6122
3
                  for (i = 0; i < n; i++) Safefree(row_names[i]);
6123                  Safefree(row_names); if (row_hashes) Safefree(row_hashes);
6124
12
                  croak("aov: invalid formula, missing '~'");
6125
9
        }
6126
9
        *tilde = '\0';
6127
9
        lhs = f_cpy;
6128        rhs = tilde + 1;
6129
6130
9
        char *restrict p_idx;
6131        while ((p_idx = strstr(rhs, "-1")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
6132
9
        while ((p_idx = strstr(rhs, "+0")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
6133        while ((p_idx = strstr(rhs, "0+")) != NULL) { has_intercept = FALSE; memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
6134        if (rhs[0] == '0' && rhs[1] == '\0')        { has_intercept = FALSE; rhs[0] = '\0'; }
6135        while ((p_idx = strstr(rhs, "+1")) != NULL) { memmove(p_idx, p_idx + 2, strlen(p_idx + 2) + 1); }
6136
3
        if (rhs[0] == '1' && rhs[1] == '\0')        { rhs[0] = '\0'; }
6137
3
        else if (rhs[0] == '1' && rhs[1] == '+')    { memmove(rhs, rhs + 2, strlen(rhs + 2) + 1); }
6138
6139        while ((p_idx = strstr(rhs, "++")) != NULL) memmove(p_idx, p_idx + 1, strlen(p_idx + 1) + 1);
6140        if (rhs[0] == '+') memmove(rhs, rhs + 1, strlen(rhs + 1) + 1);
6141
3
        size_t len_rhs = strlen(rhs);
6142
3
        if (len_rhs > 0 && rhs[len_rhs - 1] == '+') rhs[len_rhs - 1] = '\0';
6143
6144
3
        char rhs_expanded[2048] = "";
6145
3
        size_t rhs_len = 0;
6146        chunk = strtok(rhs, "+");
6147
3
        while (chunk != NULL) {
6148                if (strcmp(chunk, ".") == 0) {
6149                         AV *restrict cols = get_all_columns(data_hoa, row_hashes, n);
6150                         for (size_t c = 0; c <= av_len(cols); c++) {
6151                                  SV **restrict col_sv = av_fetch(cols, c, 0);
6152                                  if (col_sv && SvOK(*col_sv)) {
6153                                      const char *restrict col_name = SvPV_nolen(*col_sv);
6154                                      if (strcmp(col_name, lhs) != 0) {
6155
6
                                          size_t slen = strlen(col_name);
6156
6
                                          if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
6157
6
                                              if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
6158
6
                                              strcat(rhs_expanded, col_name);
6159
6
                                              rhs_len += slen;
6160                                          }
6161                                      }
6162
6
                                  }
6163
6
                         }
6164
6
                         SvREFCNT_dec(cols);
6165                } else {
6166                         size_t slen = strlen(chunk);
6167                         if (rhs_len + slen + 2 < sizeof(rhs_expanded)) {
6168
6
                                  if (rhs_len > 0) { strcat(rhs_expanded, "+"); rhs_len++; }
6169
6
                                  strcat(rhs_expanded, chunk);
6170
6
                                  rhs_len += slen;
6171                         }
6172                }
6173                chunk = strtok(NULL, "+");
6174
6
        }
6175
0
        // Setup arrays safely
6176        Newx(terms, term_cap, char*);
6177        Newx(uniq_terms, term_cap, char*);
6178        Newx(exp_terms, exp_cap, char*); Newx(parent_term, exp_cap, char*);
6179
8
        Newx(is_dummy, exp_cap, bool); Newx(is_interact, exp_cap, bool);
6180
2
        Newx(dummy_base, exp_cap, char*); Newx(dummy_level, exp_cap, char*);
6181
2
        Newx(term_map, exp_cap, int); Newx(left_idx, exp_cap, int); Newx(right_idx, exp_cap, int);
6182        if (has_intercept) { terms[num_terms++] = savepv("Intercept"); }
6183
2
        if (strlen(rhs_expanded) > 0) {
6184
2
                chunk = strtok(rhs_expanded, "+");
6185
2
                while (chunk != NULL) {
6186
1
                         if (num_terms >= term_cap - 3) {
6187
0
                                  term_cap *= 2;
6188
0
                                  Renew(terms, term_cap, char*); Renew(uniq_terms, term_cap, char*);
6189                         }
6190                         char *restrict star = strchr(chunk, '*');
6191                         if (star) {
6192
6
                                  *star = '\0';
6193
0
                                  char *restrict left = chunk;
6194
6
                                  char *right = star + 1;
6195
0
                                  char *restrict c_l = strchr(left, '^');
6196                                  if (c_l && strncmp(left, "I(", 2) != 0) *c_l = '\0';
6197
6
                                  char *restrict c_r = strchr(right, '^'); if (c_r && strncmp(right, "I(", 2) != 0) *c_r = '\0';
6198
0
                                  terms[num_terms++] = savepv(left);
6199
6
                                  terms[num_terms++] = savepv(right);
6200
0
                                  size_t inter_len = strlen(left) + strlen(right) + 2;
6201                                  terms[num_terms] = (char*)safemalloc(inter_len);
6202
6
                                  snprintf(terms[num_terms++], inter_len, "%s:%s", left, right);
6203
6
                         } else {
6204
6
                                  char *restrict c_chunk = strchr(chunk, '^');
6205
6
                                  if (c_chunk && strncmp(chunk, "I(", 2) != 0) *c_chunk = '\0';
6206                                  terms[num_terms++] = savepv(chunk);
6207                         }
6208
6
                         chunk = strtok(NULL, "+");
6209
6
                }
6210
32
        }
6211
6212
26
        for (i = 0; i < num_terms; i++) {
6213
26
                  bool found = FALSE;
6214
26
                  for (size_t k = 0; k < num_uniq; k++) {
6215
26
                        if (strcmp(terms[i], uniq_terms[k]) == 0) { found = TRUE; break; }
6216
26
                  }
6217
26
                  if (!found) uniq_terms[num_uniq++] = savepv(terms[i]);
6218
26
        }
6219        p = num_uniq;
6220
6221        Newxz(term_base_level, num_uniq, char*);
6222
6223
6
        /* PHASE 3: Categorical & Interaction Expansion */
6224
6
        for (j = 0; j < p; j++) {
6225
27
                if (p_exp + 64 >= exp_cap) {
6226
21
                        exp_cap *= 2;
6227
21
                        Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
6228
21
                        Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
6229
21
                        Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
6230
21
                        Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
6231
21
                }
6232
6233
21
                if (strcmp(uniq_terms[j], "Intercept") == 0) {
6234                        exp_terms[p_exp] = savepv("Intercept");
6235                        parent_term[p_exp] = savepv("Intercept");
6236                        is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
6237                        term_map[p_exp] = j;
6238
6
                        p_exp++;
6239
5
                        continue;
6240                }
6241
6242
4
                char *restrict colon = strchr(uniq_terms[j], ':');
6243
4
                if (colon) {
6244
4
                        char left[256], right[256];
6245                        strncpy(left, uniq_terms[j], colon - uniq_terms[j]);
6246
4
                        left[colon - uniq_terms[j]] = '\0';
6247                        strcpy(right, colon + 1);
6248
6249
3
                        int *restrict l_indices = (int*)safemalloc(p_exp * sizeof(int)); int l_count = 0;
6250
3
                        int *restrict r_indices = (int*)safemalloc(p_exp * sizeof(int)); int r_count = 0;
6251                        for (size_t e = 0; e < p_exp; e++) {
6252
3
                                if (strcmp(parent_term[e], left) == 0) l_indices[l_count++] = e;
6253
3
                                if (strcmp(parent_term[e], right) == 0) r_indices[r_count++] = e;
6254                        }
6255
6256
0
                        if (l_count == 0 || r_count == 0) {
6257
3
                                Safefree(l_indices); Safefree(r_indices);
6258
0
                                croak("aov: Interaction term '%s' requires its main effects to be explicitly included in the formula", uniq_terms[j]);
6259
0
                        } else {
6260                                for (int li = 0; li < l_count; li++) {
6261                                         for (int ri = 0; ri < r_count; ri++) {
6262
3
                                                  if (p_exp >= exp_cap) {
6263
3
                                                      exp_cap *= 2;
6264
3
                                                      Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
6265                                                      Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
6266
3
                                                      Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
6267
3
                                                      Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
6268
3
                                                  }
6269                                                  size_t t_len = strlen(exp_terms[l_indices[li]]) + strlen(exp_terms[r_indices[ri]]) + 2;
6270                                                  exp_terms[p_exp] = (char*)safemalloc(t_len);
6271                                                  snprintf(exp_terms[p_exp], t_len, "%s:%s", exp_terms[l_indices[li]], exp_terms[r_indices[ri]]);
6272
3
                                                  parent_term[p_exp] = savepv(uniq_terms[j]);
6273
3
                                                  is_dummy[p_exp] = FALSE; is_interact[p_exp] = TRUE;
6274                                                  left_idx[p_exp] = l_indices[li];
6275
3
                                                  right_idx[p_exp] = r_indices[ri];
6276
3
                                                  term_map[p_exp] = j;
6277
3
                                                  p_exp++;
6278
3
                                         }
6279                                }
6280
3
                        }
6281                        Safefree(l_indices); Safefree(r_indices);
6282
3
                } else {
6283
3
                        if (is_column_categorical(data_hoa, row_hashes, n, uniq_terms[j])) {
6284
3
                                char **restrict levels = NULL;
6285
3
                                unsigned int num_levels = 0, levels_cap = 8;
6286                                Newx(levels, levels_cap, char*);
6287
3
                                for (i = 0; i < n; i++) {
6288
3
                                         char* str_val = get_data_string_alloc(data_hoa, row_hashes, i, uniq_terms[j]);
6289
3
                                         if (str_val) {
6290
3
                                                  bool found = FALSE;
6291                                                  for (size_t l = 0; l < num_levels; l++) {
6292
3
                                                      if (strcmp(levels[l], str_val) == 0) { found = TRUE; break; }
6293                                                  }
6294                                                  if (!found) {
6295                                                      if (num_levels >= levels_cap) { levels_cap *= 2; Renew(levels, levels_cap, char*); }
6296                                                      levels[num_levels++] = savepv(str_val);
6297                                                  }
6298                                                  Safefree(str_val);
6299                                         }
6300                                }
6301
6302                                if (num_levels > 0) {
6303                                         for (size_t l1 = 0; l1 < num_levels - 1; l1++) {
6304                                                  for (size_t l2 = l1 + 1; l2 < num_levels; l2++) {
6305                                                      if (strcmp(levels[l1], levels[l2]) > 0) {
6306                                                          char *tmp = levels[l1]; levels[l1] = levels[l2]; levels[l2] = tmp;
6307                                                      }
6308                                                  }
6309                                         }
6310
6311                                         term_base_level[j] = savepv(levels[0]);
6312
6313                                         for (size_t l = 1; l < num_levels; l++) {
6314                                                  if (p_exp >= exp_cap) {
6315                                                      exp_cap *= 2;
6316                                                      Renew(exp_terms, exp_cap, char*); Renew(parent_term, exp_cap, char*);
6317                                                      Renew(is_dummy, exp_cap, bool); Renew(is_interact, exp_cap, bool);
6318                                                      Renew(dummy_base, exp_cap, char*); Renew(dummy_level, exp_cap, char*);
6319                                                      Renew(term_map, exp_cap, int); Renew(left_idx, exp_cap, int); Renew(right_idx, exp_cap, int);
6320                                                  }
6321                                                  size_t t_len = strlen(uniq_terms[j]) + strlen(levels[l]) + 1;
6322                                                  exp_terms[p_exp] = (char*)safemalloc(t_len);
6323                                                  snprintf(exp_terms[p_exp], t_len, "%s%s", uniq_terms[j], levels[l]);
6324                                                  parent_term[p_exp] = savepv(uniq_terms[j]);
6325                                                  is_dummy[p_exp] = TRUE; is_interact[p_exp] = FALSE;
6326                                                  dummy_base[p_exp] = savepv(uniq_terms[j]);
6327                                                  dummy_level[p_exp] = savepv(levels[l]);
6328                                                  term_map[p_exp] = j;
6329                                                  p_exp++;
6330                                         }
6331                                         for (size_t l = 0; l < num_levels; l++) Safefree(levels[l]);
6332                                         Safefree(levels);
6333                                } else {
6334                                         Safefree(levels);
6335                                         exp_terms[p_exp] = savepv(uniq_terms[j]);
6336                                         parent_term[p_exp] = savepv(uniq_terms[j]);
6337                                         is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
6338                                         term_map[p_exp] = j;
6339                                         p_exp++;
6340                                }
6341                        } else {
6342                                exp_terms[p_exp] = savepv(uniq_terms[j]);
6343                                parent_term[p_exp] = savepv(uniq_terms[j]);
6344                                is_dummy[p_exp] = FALSE; is_interact[p_exp] = FALSE;
6345                                term_map[p_exp] = j;
6346                                p_exp++;
6347                        }
6348                }
6349        }
6350        X_mat = (double**)safemalloc(n * sizeof(double*));
6351        for(i = 0; i < n; i++) X_mat[i] = (double*)safemalloc(p_exp * sizeof(double));
6352        Newx(Y, n, double);
6353
6354        /* PHASE 4: Matrix Construction & Listwise Deletion */
6355        for (i = 0; i < n; i++) {
6356                double y_val = evaluate_term(data_hoa, row_hashes, i, lhs);
6357                if (isnan(y_val)) { Safefree(row_names[i]); continue; }
6358                bool row_ok = TRUE;
6359                double *restrict row_x = (double*)safemalloc(p_exp * sizeof(double));
6360                for (j = 0; j < p_exp; j++) {
6361                          if (strcmp(exp_terms[j], "Intercept") == 0) {
6362                                   row_x[j] = 1.0;
6363                          } else if (is_interact[j]) {
6364                                   row_x[j] = row_x[left_idx[j]] * row_x[right_idx[j]];
6365                          } else if (is_dummy[j]) {
6366                                   char*restrict str_val = get_data_string_alloc(data_hoa, row_hashes, i, dummy_base[j]);
6367                                   if (str_val) {
6368                                       row_x[j] = (strcmp(str_val, dummy_level[j]) == 0) ? 1.0 : 0.0;
6369                                       Safefree(str_val);
6370                                   } else { row_ok = FALSE; break; }
6371                          } else {
6372                                   row_x[j] = evaluate_term(data_hoa, row_hashes, i, parent_term[j]);
6373                                   if (isnan(row_x[j])) { row_ok = FALSE; break; }
6374                          }
6375                }
6376                if (!row_ok) { Safefree(row_names[i]); Safefree(row_x); continue; }
6377                Y[valid_n] = y_val;
6378                for (j = 0; j < p_exp; j++) X_mat[valid_n][j] = row_x[j];
6379                valid_n++;
6380                Safefree(row_x);
6381                Safefree(row_names[i]);
6382        }
6383        Safefree(row_names);
6384        if (valid_n <= p_exp) {
6385                // Full Clean Up
6386                for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
6387                for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
6388                for (j = 0; j < p_exp; j++) {
6389                         Safefree(exp_terms[j]); Safefree(parent_term[j]);
6390                         if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
6391                }
6392                Safefree(exp_terms); Safefree(parent_term);
6393                Safefree(is_dummy); Safefree(is_interact);
6394                Safefree(dummy_base); Safefree(dummy_level);
6395                Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
6396                for(i = 0; i < n; i++) Safefree(X_mat[i]);
6397                Safefree(X_mat); Safefree(Y);
6398                if (row_hashes) Safefree(row_hashes);
6399                for (i = 0; i < num_uniq; i++) { if (term_base_level[i]) Safefree(term_base_level[i]); }
6400                Safefree(term_base_level);
6401                croak("aov: 0 degrees of freedom (too many NAs or parameters > observations)");
6402        }
6403        /* PHASE 5: Math & Output Formatting */
6404        bool *restrict aliased_qr = (bool*)safemalloc(p_exp * sizeof(bool));
6405        size_t *restrict rank_map = (size_t*)safemalloc(p_exp * sizeof(size_t));
6406        apply_householder_aov(X_mat, Y, valid_n, p_exp, aliased_qr, rank_map);
6407        double *restrict term_ss;
6408        int *restrict term_df;
6409        Newxz(term_ss, num_uniq, double);
6410        Newxz(term_df, num_uniq, int);
6411        for (i = 0; i < p_exp; i++) {
6412                if (strcmp(exp_terms[i], "Intercept") == 0) continue;
6413                if (aliased_qr[i]) continue;
6414                int t_idx = term_map[i];
6415                size_t r_k = rank_map[i];
6416                term_ss[t_idx] += Y[r_k] * Y[r_k];
6417                term_df[t_idx] += 1;
6418        }
6419        int rank = 0;
6420        for (i = 0; i < p_exp; i++) {
6421                  if (!aliased_qr[i]) rank++;
6422        }
6423        double rss_prev = 0.0;
6424        for (i = rank; i < valid_n; i++) {
6425                  rss_prev += Y[i] * Y[i];
6426        }
6427        int res_df = valid_n - rank;
6428        double ms_res = (res_df > 0) ? rss_prev / res_df : 0.0;
6429        HV*restrict ret_hash = newHV();
6430        for (j = 0; j < num_uniq; j++) {
6431                  if (strcmp(uniq_terms[j], "Intercept") == 0) continue;
6432                  HV*restrict term_stats = newHV();
6433                  double ss = term_ss[j];
6434                  int df = term_df[j];
6435                  double ms = (df > 0) ? ss / df : 0.0;
6436
6437                  hv_stores(term_stats, "Df", newSViv(df));
6438                  hv_stores(term_stats, "Sum Sq", newSVnv(ss));
6439                  hv_stores(term_stats, "Mean Sq", newSVnv(ms));
6440                  if (ms_res > 0.0 && df > 0) {
6441                        double f_val = ms / ms_res;
6442                        hv_stores(term_stats, "F value", newSVnv(f_val));
6443                        hv_stores(term_stats, "Pr(>F)", newSVnv(1.0 - pf(f_val, (double)df, (double)res_df)));
6444                  } else {
6445                        hv_stores(term_stats, "F value", newSVnv(NAN));
6446                        hv_stores(term_stats, "Pr(>F)", newSVnv(NAN));
6447                  }
6448                  hv_store(ret_hash, uniq_terms[j], strlen(uniq_terms[j]), newRV_noinc((SV*)term_stats), 0);
6449        }
6450        HV*restrict res_stats = newHV();
6451        hv_stores(res_stats, "Df", newSViv(res_df));
6452        hv_stores(res_stats, "Sum Sq", newSVnv(rss_prev));
6453        hv_stores(res_stats, "Mean Sq", newSVnv(ms_res));
6454        hv_stores(ret_hash, "Residuals", newRV_noinc((SV*)res_stats));
6455        {
6456                  HV *restrict tgt_hoa = data_hoa;
6457                  HV **restrict tgt_row_hashes = row_hashes;
6458                  size_t tgt_n = n;
6459                  // Route evaluation to the original unstacked HoA when a formula was implied
6460                  if (is_stacked) {
6461                      tgt_hoa = (HV*)SvRV(orig_data_sv);
6462                      tgt_row_hashes = NULL;
6463                      hv_iterinit(tgt_hoa);
6464                      HE *restrict e = hv_iternext(tgt_hoa);
6465                      if (e) {
6466                          SV *val = hv_iterval(tgt_hoa, e);
6467                          if (SvROK(val) && SvTYPE(SvRV(val)) == SVt_PVAV) {
6468                              tgt_n = av_len((AV*)SvRV(val)) + 1;
6469                          }
6470                      }
6471                  }
6472                  AV *restrict all_cols = get_all_columns(tgt_hoa, tgt_row_hashes, tgt_n);
6473                  HV *restrict mean_hv  = newHV();
6474                  HV *restrict size_hv  = newHV();
6475                  for (size_t c = 0; c <= (size_t)av_len(all_cols); c++) {
6476                      SV **restrict col_sv = av_fetch(all_cols, c, 0);
6477                      if (!col_sv || !SvOK(*col_sv)) continue;
6478                      const char *restrict col_name = SvPV_nolen(*col_sv);
6479
6480                      double col_sum = 0.0;
6481                      IV      col_count = 0;
6482                      for (i = 0; i < tgt_n; i++) {
6483                          double val = evaluate_term(tgt_hoa, tgt_row_hashes, i, col_name);
6484                          if (!isnan(val)) { col_sum += val; col_count++; }
6485                      }
6486
6487                      double col_mean = (col_count > 0) ? col_sum / col_count : NAN;
6488                      hv_store(mean_hv, col_name, strlen(col_name), newSVnv(col_mean), 0);
6489                      hv_store(size_hv, col_name, strlen(col_name), newSViv(col_count), 0);
6490                  }
6491                  SvREFCNT_dec(all_cols);
6492                  HV *restrict gs_hv = newHV();
6493                  hv_stores(gs_hv, "mean", newRV_noinc((SV*)mean_hv));
6494                  hv_stores(gs_hv, "size", newRV_noinc((SV*)size_hv));
6495                  hv_stores(ret_hash, "group_stats", newRV_noinc((SV*)gs_hv));
6496        }
6497        /* Deep Cleanup */
6498        for (i = 0; i < num_terms; i++) Safefree(terms[i]); Safefree(terms);
6499        for (i = 0; i < num_uniq; i++) Safefree(uniq_terms[i]); Safefree(uniq_terms);
6500        for (j = 0; j < p_exp; j++) {
6501                  Safefree(exp_terms[j]); Safefree(parent_term[j]);
6502                  if (is_dummy[j]) { Safefree(dummy_base[j]); Safefree(dummy_level[j]); }
6503        }
6504        Safefree(exp_terms); Safefree(parent_term);
6505        Safefree(is_dummy); Safefree(is_interact);
6506        Safefree(dummy_base); Safefree(dummy_level);
6507        Safefree(term_map); Safefree(left_idx); Safefree(right_idx);
6508        Safefree(term_ss); Safefree(term_df);
6509        for (i = 0; i < n; i++) Safefree(X_mat[i]);
6510        Safefree(X_mat); Safefree(Y);
6511        Safefree(aliased_qr); Safefree(rank_map);
6512        for (i = 0; i < num_uniq; i++) { if (term_base_level[i]) Safefree(term_base_level[i]); }
6513        Safefree(term_base_level);
6514        if (row_hashes) Safefree(row_hashes);
6515        RETVAL = newRV_noinc((SV*)ret_hash);
6516        }
6517OUTPUT:
6518    RETVAL
6519
6520PROTOTYPES: DISABLE
6521
6522SV* fisher_test(...)
6523CODE:
6524{
6525        if (items < 1) croak("fisher_test requires at least a data reference");
6526
6527        SV*restrict data_ref = ST(0);
6528        double conf_level = 0.95;
6529        const char*restrict alternative = "two.sided";
6530
6531        // Parse named arguments
6532        for (unsigned short int i = 1; i < items; i += 2) {
6533                if (i + 1 >= items) croak("fisher_test: odd number of arguments");
6534                const char*restrict key = SvPV_nolen(ST(i));
6535                SV*restrict val = ST(i + 1);
6536                if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) {
6537                        conf_level = SvNV(val);
6538                } else if (strEQ(key, "alternative")) {
6539                        alternative = SvPV_nolen(val);
6540                }
6541        }
6542
6543        if (!SvROK(data_ref)) croak("fisher_test requires a reference to an Array or Hash");
6544        SV*restrict deref = SvRV(data_ref);
6545        size_t a = 0, b = 0, c = 0, d = 0;
6546        // Extract Data
6547        if (SvTYPE(deref) == SVt_PVAV) {
6548          AV*restrict outer = (AV*)deref;
6549          if (av_len(outer) != 1) croak("Outer array must have exactly 2 rows");
6550          SV**restrict row1_ptr = av_fetch(outer, 0, 0);
6551          SV**restrict row2_ptr = av_fetch(outer, 1, 0);
6552          if (row1_ptr && row2_ptr && SvROK(*row1_ptr) && SvROK(*row2_ptr)) {
6553                   AV*restrict row1 = (AV*)SvRV(*row1_ptr);
6554                   AV*restrict row2 = (AV*)SvRV(*row2_ptr);
6555                   SV**restrict a_ptr = av_fetch(row1, 0, 0);
6556                   SV**restrict b_ptr = av_fetch(row1, 1, 0);
6557                   SV**restrict c_ptr = av_fetch(row2, 0, 0);
6558                   SV**restrict d_ptr = av_fetch(row2, 1, 0);
6559                   a = (a_ptr && SvOK(*a_ptr)) ? SvIV(*a_ptr) : 0;
6560                   b = (b_ptr && SvOK(*b_ptr)) ? SvIV(*b_ptr) : 0;
6561                   c = (c_ptr && SvOK(*c_ptr)) ? SvIV(*c_ptr) : 0;
6562                   d = (d_ptr && SvOK(*d_ptr)) ? SvIV(*d_ptr) : 0;
6563          } else {
6564                  croak("Invalid 2D Array structure");
6565          }
6566        } else if (SvTYPE(deref) == SVt_PVHV) {
6567                // Fixed 2D Hash Logic: Sort keys lexically to enforce structured rows/columns
6568                HV*restrict outer = (HV*)deref;
6569                if (hv_iterinit(outer) != 2) croak("Outer hash must have exactly 2 keys");
6570                HE*restrict he1 = hv_iternext(outer);
6571                HE*restrict he2 = hv_iternext(outer);
6572                if (!he1 || !he2) croak("Invalid outer hash");
6573                const char*restrict k1 = SvPV_nolen(hv_iterkeysv(he1));
6574                const char*restrict k2 = SvPV_nolen(hv_iterkeysv(he2));
6575                HE*restrict row1_he = (strcmp(k1, k2) < 0) ? he1 : he2;
6576                HE*restrict row2_he = (strcmp(k1, k2) < 0) ? he2 : he1;
6577                SV*restrict row1_sv = hv_iterval(outer, row1_he);
6578                SV*restrict row2_sv = hv_iterval(outer, row2_he);
6579                if (!SvROK(row1_sv) || SvTYPE(SvRV(row1_sv)) != SVt_PVHV ||
6580                        !SvROK(row2_sv) || SvTYPE(SvRV(row2_sv)) != SVt_PVHV) {
6581                        croak("Inner elements must be hashes");
6582                }
6583                HV*restrict in1 = (HV*)SvRV(row1_sv);
6584                HV*restrict in2 = (HV*)SvRV(row2_sv);
6585                if (hv_iterinit(in1) != 2 || hv_iterinit(in2) != 2) croak("Inner hashes must have exactly 2 keys");
6586                HE*restrict in1_he1 = hv_iternext(in1);
6587                HE*restrict in1_he2 = hv_iternext(in1);
6588                const char*restrict in1_k1 = SvPV_nolen(hv_iterkeysv(in1_he1));
6589                const char*restrict in1_k2 = SvPV_nolen(hv_iterkeysv(in1_he2));
6590                HE*restrict in1_c1 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he1 : in1_he2;
6591                HE*restrict in1_c2 = (strcmp(in1_k1, in1_k2) < 0) ? in1_he2 : in1_he1;
6592                HE*restrict in2_he1 = hv_iternext(in2);
6593                HE*restrict in2_he2 = hv_iternext(in2);
6594                const char*restrict in2_k1 = SvPV_nolen(hv_iterkeysv(in2_he1));
6595                const char*restrict in2_k2 = SvPV_nolen(hv_iterkeysv(in2_he2));
6596                HE*restrict in2_c1 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he1 : in2_he2;
6597                HE*restrict in2_c2 = (strcmp(in2_k1, in2_k2) < 0) ? in2_he2 : in2_he1;
6598                a = (hv_iterval(in1, in1_c1) && SvOK(hv_iterval(in1, in1_c1))) ? SvIV(hv_iterval(in1, in1_c1)) : 0;
6599                b = (hv_iterval(in1, in1_c2) && SvOK(hv_iterval(in1, in1_c2))) ? SvIV(hv_iterval(in1, in1_c2)) : 0;
6600                c = (hv_iterval(in2, in2_c1) && SvOK(hv_iterval(in2, in2_c1))) ? SvIV(hv_iterval(in2, in2_c1)) : 0;
6601                d = (hv_iterval(in2, in2_c2) && SvOK(hv_iterval(in2, in2_c2))) ? SvIV(hv_iterval(in2, in2_c2)) : 0;
6602        } else {
6603          croak("Input must be a 2D Array or 2D Hash");
6604        }
6605
6606        // Perform Calculations via Helpers
6607        double p_val = exact_p_value(a, b, c, d, alternative);
6608        double mle_or, ci_low, ci_high;
6609        calculate_exact_stats(a, b, c, d, conf_level, alternative, &mle_or, &ci_low, &ci_high);
6610
6611        // Construct the Return HashRef purely in C
6612        HV*restrict ret_hash = newHV();
6613        hv_stores(ret_hash, "method", newSVpv("Fisher's Exact Test for Count Data", 0));
6614        hv_stores(ret_hash, "alternative", newSVpv(alternative, 0));
6615        AV*restrict ci_array = newAV();
6616        av_push(ci_array, newSVnv(ci_low));
6617        av_push(ci_array, newSVnv(ci_high));
6618        hv_stores(ret_hash, "conf_int", newRV_noinc((SV*)ci_array));
6619        HV*restrict est_hash = newHV();
6620        hv_stores(ret_hash, "estimate", newRV_noinc((SV*)est_hash));
6621        hv_stores(est_hash, "odds ratio", newSVnv(mle_or));
6622        hv_stores(ret_hash, "p_value", newSVnv(p_val));
6623        // Return the HashRef
6624        RETVAL = newRV_noinc((SV*)ret_hash);
6625}
6626OUTPUT:
6627  RETVAL
6628
6629SV* power_t_test(...)
6630CODE:
6631{
6632        SV*restrict sv_n = NULL;
6633        SV*restrict sv_delta = NULL;
6634        SV*restrict sv_sd = NULL;
6635        SV*restrict sv_sig_level = NULL;
6636        SV*restrict sv_power = NULL;
6637
6638        const char* restrict type = "two.sample";
6639        const char* restrict alternative = "two.sided";
6640        bool strict = FALSE;
6641        double tol = pow(2.2204460492503131e-16, 0.25);
6642
6643        if (items % 2 != 0) croak("Usage: power_t_test(n => 30, delta => 0.5, sd => 1.0, ...)");
6644        for (unsigned short int i = 0; i < items; i += 2) {
6645          const char* restrict key = SvPV_nolen(ST(i));
6646          SV* restrict val = ST(i+1);
6647
6648          if      (strEQ(key, "n"))           sv_n = val;
6649          else if (strEQ(key, "delta"))       sv_delta = val;
6650          else if (strEQ(key, "sd"))          sv_sd = val;
6651          else if (strEQ(key, "sig.level") || strEQ(key, "sig_level")) sv_sig_level = val;
6652          else if (strEQ(key, "power"))       sv_power = val;
6653          else if (strEQ(key, "type"))        type = SvPV_nolen(val);
6654          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
6655          else if (strEQ(key, "strict"))      strict = SvTRUE(val);
6656          else if (strEQ(key, "tol"))         tol = SvNV(val);
6657          else croak("power_t_test: unknown argument '%s'", key);
6658        }
6659
6660        bool is_null_n = (!sv_n || !SvOK(sv_n));
6661        bool is_null_delta = (!sv_delta || !SvOK(sv_delta));
6662        bool is_null_power = (!sv_power || !SvOK(sv_power));
6663        bool is_null_sd = (sv_sd && !SvOK(sv_sd));
6664        bool is_null_sig_level = (sv_sig_level && !SvOK(sv_sig_level));
6665
6666        unsigned int missing_count = 0;
6667        if (is_null_n) missing_count++;
6668        if (is_null_delta) missing_count++;
6669        if (is_null_power) missing_count++;
6670        if (is_null_sd) missing_count++;
6671        if (is_null_sig_level) missing_count++;
6672
6673        if (missing_count != 1) {
6674          croak("power_t_test: exactly one of 'n', 'delta', 'sd', 'power', and 'sig_level' must be undef/NULL");
6675        }
6676
6677        double n = is_null_n ? 0.0 : SvNV(sv_n);
6678        double delta = is_null_delta ? 0.0 : SvNV(sv_delta);
6679        double sd = (!sv_sd || is_null_sd) ? 1.0 : SvNV(sv_sd);
6680        double sig_level = (!sv_sig_level || is_null_sig_level) ? 0.05 : SvNV(sv_sig_level);
6681        double power = is_null_power ? 0.0 : SvNV(sv_power);
6682        short int tsample = (strEQ(type, "one.sample") || strEQ(type, "paired")) ? 1 : 2;
6683        short int tside = (strEQ(alternative, "one.sided") || strEQ(alternative, "greater") || strEQ(alternative, "less")) ? 1 : 2;
6684        if (tside == 2 && !is_null_delta) delta = fabs(delta);
6685        if (is_null_power) {
6686          power = p_body(n, delta, sd, sig_level, tsample, tside, strict);
6687        } else if (is_null_n) {
6688          double low = 2.0, high = 1e7;
6689          while (p_body(high, delta, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
6690          while (high - low > tol) {
6691                   double mid = low + (high - low) / 2.0;
6692                   if (p_body(mid, delta, sd, sig_level, tsample, tside, strict) < power) low = mid;
6693                   else high = mid;
6694          }
6695          n = low + (high - low) / 2.0;
6696        } else if (is_null_sd) {
6697          double low = delta * 1e-7, high = delta * 1e7;
6698          while (high - low > tol) {
6699                   double mid = low + (high - low) / 2.0;
6700                   if (p_body(n, delta, mid, sig_level, tsample, tside, strict) > power) low = mid;
6701                   else high = mid;
6702          }
6703          sd = low + (high - low) / 2.0;
6704        } else if (is_null_delta) {
6705          double low = sd * 1e-7, high = sd * 1e7;
6706          while (p_body(n, high, sd, sig_level, tsample, tside, strict) < power && high < 1e12) high *= 2.0;
6707          while (high - low > tol) {
6708                   double mid = low + (high - low) / 2.0;
6709                   if (p_body(n, mid, sd, sig_level, tsample, tside, strict) < power) low = mid;
6710                   else high = mid;
6711          }
6712          delta = low + (high - low) / 2.0;
6713        } else if (is_null_sig_level) {
6714          double low = 1e-10, high = 1.0 - 1e-10;
6715          while (high - low > tol) {
6716                   double mid = low + (high - low) / 2.0;
6717                   if (p_body(n, delta, sd, mid, tsample, tside, strict) < power) low = mid;
6718                   else high = mid;
6719          }
6720          sig_level = low + (high - low) / 2.0;
6721        }
6722        HV*restrict ret = newHV();
6723        hv_stores(ret, "n", newSVnv(n));
6724        hv_stores(ret, "delta", newSVnv(delta));
6725        hv_stores(ret, "sd", newSVnv(sd));
6726        hv_stores(ret, "sig.level", newSVnv(sig_level));
6727        hv_stores(ret, "power", newSVnv(power));
6728        hv_stores(ret, "alternative", newSVpv(alternative, 0));
6729        const char*restrict m_str = (tsample == 1) ? (strEQ(type, "paired") ? "Paired t test power calculation" : "One-sample t test power calculation") : "Two-sample t test power calculation";
6730        hv_stores(ret, "method", newSVpv(m_str, 0));
6731        const char*restrict n_str = (tsample == 2) ? "n is number in *each* group" : (strEQ(type, "paired") ? "n is number of *pairs*, sd is std.dev. of *differences* within pairs" : "");
6732        if (n_str[0] != '\0') hv_stores(ret, "note", newSVpv(n_str, 0));
6733        RETVAL = newRV_noinc((SV*)ret);
6734}
6735OUTPUT:
6736    RETVAL
6737
6738SV* kruskal_test(...)
6739CODE:
6740{
6741        SV *restrict x_sv = NULL, *restrict g_sv = NULL, *restrict h_sv = NULL;
6742        unsigned int arg_idx = 0;
6743
6744        // 1. Shift positional arguments
6745        //    Accept either: (arrayref, arrayref) or (hashref)
6746        if (arg_idx < items && SvROK(ST(arg_idx))) {
6747          svtype t = SvTYPE(SvRV(ST(arg_idx)));
6748          if (t == SVt_PVAV) {
6749                   x_sv = ST(arg_idx++);
6750          } else if (t == SVt_PVHV) {
6751                   h_sv = ST(arg_idx++);          /* hash-of-arrays shortcut */
6752          }
6753        }
6754        if (!h_sv && arg_idx < items
6755                     && SvROK(ST(arg_idx))
6756                     && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6757          g_sv = ST(arg_idx++);
6758        }
6759        // 2. Parse named arguments (fallback)
6760        for (; arg_idx < items; arg_idx += 2) {
6761          const char *restrict key = SvPV_nolen(ST(arg_idx));
6762          SV         *restrict val = ST(arg_idx + 1);
6763          if      (strEQ(key, "x")) x_sv = val;
6764          else if (strEQ(key, "g")) g_sv = val;
6765          else if (strEQ(key, "h")) h_sv = val;
6766          else croak("kruskal_test: unknown argument '%s'", key);
6767        }
6768        // 3. Mutual-exclusion guard
6769        if (h_sv && (x_sv || g_sv))
6770          croak("kruskal_test: cannot mix 'h' (hash-of-arrays) with 'x'/'g' inputs");
6771
6772        /* ------------------------------------------------------------------ */
6773        /* Shared state filled by whichever input branch runs                 */
6774        /* ------------------------------------------------------------------ */
6775        RankInfo *restrict ri = NULL;
6776        char **restrict group_names = NULL; /* Track names to build group_stats */
6777        size_t valid_n = 0;
6778        size_t k       = 0;
6779
6780        /* ------------------------------------------------------------------ */
6781        /* 4a. Hash-of-arrays input path                                      */
6782        /*     my %x = ( group1 => [...], group2 => [...], ... )              */
6783        /* ------------------------------------------------------------------ */
6784        if (h_sv) {
6785          if (!SvROK(h_sv) || SvTYPE(SvRV(h_sv)) != SVt_PVHV)
6786                   croak("kruskal_test: 'h' must be a HASH reference");
6787          HV *restrict h_hv = (HV*)SvRV(h_sv);
6788          // First pass – validate values and tally total elements
6789          size_t total = 0;
6790          hv_iterinit(h_hv);
6791          HE *restrict he;
6792          while ((he = hv_iternext(h_hv))) {
6793                   SV *restrict val = HeVAL(he);
6794                   if (!SvROK(val) || SvTYPE(SvRV(val)) != SVt_PVAV)
6795                       croak("kruskal_test: every value in 'h' must be an ARRAY reference");
6796                   total += (size_t)(av_len((AV*)SvRV(val)) + 1);
6797          }
6798          if (total < 2) croak("not enough observations");
6799
6800          ri = (RankInfo *)safemalloc(total * sizeof(RankInfo));
6801          size_t num_keys = HvKEYS(h_hv);
6802          group_names = (char **)safecalloc(num_keys, sizeof(char*));
6803          /* Second pass – fill ri[], assigning one group_id per hash key */
6804          size_t group_id = 0;
6805          hv_iterinit(h_hv);
6806          while ((he = hv_iternext(h_hv))) {
6807                   STRLEN klen;
6808                   const char *restrict key_str = HePV(he, klen);
6809                   group_names[group_id] = savepvn(key_str, klen); // Save string key
6810
6811                   AV *restrict av  = (AV*)SvRV(HeVAL(he));
6812                   size_t       n_g = (size_t)(av_len(av) + 1);
6813                   for (size_t i = 0; i < n_g; i++) {
6814                       SV **restrict el = av_fetch(av, i, 0);
6815                       if (el && SvOK(*el) && looks_like_number(*el)) {
6816                           ri[valid_n].val = SvNV(*el);
6817                           ri[valid_n].idx = group_id;   /* group identity */
6818                           valid_n++;
6819                       }
6820                   }
6821                   group_id++;
6822          }
6823          k = group_id;   /* number of unique groups = number of hash keys */
6824
6825        /* ------------------------------------------------------------------ */
6826        /* 4b. Original x / g array-pair input path                           */
6827        /* ------------------------------------------------------------------ */
6828        } else {
6829          if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
6830                   croak("kruskal_test: 'x' is a required argument and must be an ARRAY reference");
6831          if (!g_sv || !SvROK(g_sv) || SvTYPE(SvRV(g_sv)) != SVt_PVAV)
6832                   croak("kruskal_test: 'g' is a required argument and must be an ARRAY reference");
6833
6834          AV *restrict x_av = (AV*)SvRV(x_sv);
6835          AV *restrict g_av = (AV*)SvRV(g_sv);
6836          size_t nx = (size_t)(av_len(x_av) + 1);
6837          size_t ng = (size_t)(av_len(g_av) + 1);
6838          if (nx != ng) croak("kruskal_test: 'x' and 'g' must have the same length");
6839          if (nx < 2)   croak("not enough observations");
6840
6841          ri = (RankInfo *)safemalloc(nx * sizeof(RankInfo));
6842          group_names = (char **)safecalloc(nx, sizeof(char*)); // Upper bound
6843
6844          // Map string group names → contiguous integer IDs
6845          HV *restrict group_map    = newHV();
6846          size_t          next_group_id = 0;
6847
6848          for (size_t i = 0; i < nx; i++) {
6849                   SV **restrict x_el = av_fetch(x_av, i, 0);
6850                   SV **restrict g_el = av_fetch(g_av, i, 0);
6851                   if (x_el && SvOK(*x_el) && looks_like_number(*x_el)
6852                            && g_el && SvOK(*g_el)) {
6853                       const char *restrict g_str = SvPV_nolen(*g_el);
6854                       STRLEN               glen  = strlen(g_str);
6855                       SV   **restrict id_sv = hv_fetch(group_map, g_str, glen, 0);
6856                       size_t group_id;
6857                       if (id_sv) {
6858                           group_id = SvUV(*id_sv);
6859                       } else {
6860                           group_id = next_group_id++;
6861                           hv_store(group_map, g_str, glen, newSVuv(group_id), 0);
6862                           group_names[group_id] = savepvn(g_str, glen); // Save string key
6863                       }
6864                       ri[valid_n].val = SvNV(*x_el);
6865                       ri[valid_n].idx = group_id;
6866                       valid_n++;
6867                   }
6868          }
6869          k = next_group_id;
6870          SvREFCNT_dec(group_map);
6871        }
6872
6873        /* ------------------------------------------------------------------ */
6874        /* 5. Shared post-extraction validation                               */
6875        /* ------------------------------------------------------------------ */
6876        if (valid_n < 2 || k < 2) {
6877          Safefree(ri);
6878          if (group_names) {
6879                   for (size_t i = 0; i < k; i++) { if (group_names[i]) Safefree(group_names[i]); }
6880                   Safefree(group_names);
6881          }
6882          if (valid_n < 2) croak("not enough observations");
6883          croak("all observations are in the same group");
6884        }
6885
6886        // 6. Ranking and Tie Accumulation (Reusing LikeR Helper)
6887        bool   has_ties = 0;
6888        double tie_adj  = rank_and_count_ties(ri, valid_n, &has_ties);
6889
6890        // 7. Aggregate Sum of Ranks AND Actual Values by Group
6891        double *restrict group_rank_sums = (double *)safecalloc(k, sizeof(double));
6892        double *restrict group_val_sums  = (double *)safecalloc(k, sizeof(double)); // For Mean
6893        size_t *restrict group_counts    = (size_t *)safecalloc(k, sizeof(size_t));
6894
6895        for (size_t i = 0; i < valid_n; i++) {
6896          size_t g_id = ri[i].idx;
6897          group_rank_sums[g_id] += ri[i].rank;
6898          group_val_sums[g_id]  += ri[i].val;
6899          group_counts[g_id]++;
6900        }
6901
6902        // 8. Calculate STATISTIC
6903        double stat_base = 0.0;
6904        for (size_t i = 0; i < k; i++) {
6905          if (group_counts[i] > 0)
6906                   stat_base += (group_rank_sums[i] * group_rank_sums[i])
6907                                / (double)group_counts[i];
6908        }
6909
6910        double n_d  = (double)valid_n;
6911        double stat = (12.0 * stat_base / (n_d * (n_d + 1.0))) - 3.0 * (n_d + 1.0);
6912        if (tie_adj > 0.0) {
6913          double tie_denom = 1.0 - (tie_adj / (n_d * n_d * n_d - n_d));
6914          stat /= tie_denom;
6915        }
6916        int    df    = (int)k - 1;
6917        double p_val = get_p_value(stat, df);
6918
6919        // 9. Return structured data exactly like R's htest
6920        HV *restrict res = newHV();
6921        hv_stores(res, "statistic", newSVnv(stat));
6922        hv_stores(res, "parameter", newSViv(df));
6923        hv_stores(res, "p_value",   newSVnv(p_val));
6924        hv_stores(res, "p.value",   newSVnv(p_val));
6925        hv_stores(res, "method",    newSVpv("Kruskal-Wallis rank sum test", 0));
6926
6927        // 10. Build the group_stats hash
6928        HV *restrict group_stats = newHV();
6929        HV *restrict stats_mean  = newHV();
6930        HV *restrict stats_size  = newHV();
6931
6932        for (size_t i = 0; i < k; i++) {
6933          if (group_counts[i] > 0 && group_names[i]) {
6934                   double mean = group_val_sums[i] / (double)group_counts[i];
6935                   size_t nlen = strlen(group_names[i]);
6936
6937                   hv_store(stats_mean, group_names[i], nlen, newSVnv(mean), 0);
6938                   hv_store(stats_size, group_names[i], nlen, newSVuv(group_counts[i]), 0);
6939          }
6940          if (group_names[i]) Safefree(group_names[i]); // Clean up name copy
6941        }
6942
6943        // Embed the nested hashes
6944        hv_stores(group_stats, "mean", newRV_noinc((SV*)stats_mean));
6945        hv_stores(group_stats, "size", newRV_noinc((SV*)stats_size));
6946        hv_stores(res, "group_stats",  newRV_noinc((SV*)group_stats));
6947
6948        // Memory Cleanup
6949        Safefree(group_names);    Safefree(group_rank_sums);
6950        Safefree(group_val_sums); Safefree(group_counts);
6951        Safefree(ri);
6952
6953        RETVAL = newRV_noinc((SV*)res);
6954}
6955OUTPUT:
6956    RETVAL
6957
6958SV* var_test(...)
6959CODE:
6960{
6961        SV* restrict x_sv = NULL;
6962        SV* restrict y_sv = NULL;
6963        double ratio = 1.0, conf_level = 0.95;
6964        const char* restrict alternative = "two.sided";
6965        unsigned int arg_idx = 0;
6966
6967        // 1. Shift positional argument 'x' if it's an array reference
6968        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6969          x_sv = ST(arg_idx);
6970          arg_idx++;
6971        }
6972
6973        // 2. Shift positional argument 'y' if it's an array reference
6974        if (arg_idx < items && SvROK(ST(arg_idx)) && SvTYPE(SvRV(ST(arg_idx))) == SVt_PVAV) {
6975          y_sv = ST(arg_idx);
6976          arg_idx++;
6977        }
6978
6979        // Ensure the remaining arguments form complete key-value pairs
6980        if ((items - arg_idx) % 2 != 0) {
6981          croak("Usage: var_test(\\@x, \\@y, key => value, ...)");
6982        }
6983
6984        // --- Parse named arguments from the remaining flat stack ---
6985        for (; arg_idx < items; arg_idx += 2) {
6986          const char* restrict key = SvPV_nolen(ST(arg_idx));
6987          SV* restrict val = ST(arg_idx + 1);
6988
6989          if      (strEQ(key, "x"))           x_sv        = val;
6990          else if (strEQ(key, "y"))           y_sv        = val;
6991          else if (strEQ(key, "ratio"))       ratio       = SvNV(val);
6992          else if (strEQ(key, "conf_level") || strEQ(key, "conf.level")) conf_level = SvNV(val);
6993          else if (strEQ(key, "alternative")) alternative = SvPV_nolen(val);
6994          else croak("var_test: unknown argument '%s'", key);
6995        }
6996
6997        // --- Validate required inputs / types ---
6998        if (!x_sv || !SvROK(x_sv) || SvTYPE(SvRV(x_sv)) != SVt_PVAV)
6999          croak("var_test: 'x' is a required argument and must be an ARRAY reference");
7000        if (!y_sv || !SvROK(y_sv) || SvTYPE(SvRV(y_sv)) != SVt_PVAV)
7001          croak("var_test: 'y' is a required argument and must be an ARRAY reference");
7002
7003        if (ratio <= 0.0 || !isfinite(ratio))
7004          croak("var_test: 'ratio' must be a single positive number");
7005        if (conf_level <= 0.0 || conf_level >= 1.0 || !isfinite(conf_level))
7006          croak("var_test: 'conf.level' must be a single number between 0 and 1");
7007
7008        AV* restrict x_av = (AV*)SvRV(x_sv);
7009        AV* restrict y_av = (AV*)SvRV(y_sv);
7010        size_t nx_raw = av_len(x_av) + 1;
7011        size_t ny_raw = av_len(y_av) + 1;
7012
7013        // --- Computation via Welford's Algorithm (ignoring NaNs) ---
7014        double mean_x = 0.0, M2_x = 0.0;
7015        size_t nx = 0;
7016        for (size_t i = 0; i < nx_raw; i++) {
7017                SV** restrict tv = av_fetch(x_av, i, 0);
7018                if (tv && SvOK(*tv) && looks_like_number(*tv)) {
7019                        double val = SvNV(*tv);
7020                        if (!isnan(val) && isfinite(val)) {
7021                                nx++;
7022                                double delta = val - mean_x;
7023                                mean_x += delta / nx;
7024                                M2_x += delta * (val - mean_x);
7025                        }
7026                }
7027        }
7028
7029        double mean_y = 0.0, M2_y = 0.0;
7030        size_t ny = 0;
7031        for (size_t i = 0; i < ny_raw; i++) {
7032          SV** restrict tv = av_fetch(y_av, i, 0);
7033          if (tv && SvOK(*tv) && looks_like_number(*tv)) {
7034                   double val = SvNV(*tv);
7035                   if (!isnan(val) && isfinite(val)) {
7036                       ny++;
7037                       double delta = val - mean_y;
7038                       mean_y += delta / ny;
7039                       M2_y += delta * (val - mean_y);
7040                   }
7041          }
7042        }
7043
7044        if (nx < 2) croak("not enough 'x' observations");
7045        if (ny < 2) croak("not enough 'y' observations");
7046
7047        double df_x = (double)(nx - 1);
7048        double df_y = (double)(ny - 1);
7049        double var_x = M2_x / df_x;
7050        double var_y = M2_y / df_y;
7051
7052        if (var_y == 0.0) croak("var_test: variance of 'y' is zero (cannot divide by zero)");
7053
7054        // --- Statistics Math ---
7055        double estimate = var_x / var_y;
7056        double statistic = estimate / ratio;
7057
7058        double p_val = pf(statistic, df_x, df_y);
7059        double ci_lower = 0.0, ci_upper = INFINITY;
7060
7061        if (strcmp(alternative, "less") == 0) {
7062          ci_upper = estimate / qf_bisection(1.0 - conf_level, df_x, df_y);
7063        } else if (strcmp(alternative, "greater") == 0) {
7064          p_val = 1.0 - p_val;
7065          ci_lower = estimate / qf_bisection(conf_level, df_x, df_y);
7066        } else {
7067          // two.sided
7068          double p1 = p_val;
7069          double p2 = 1.0 - p_val;
7070          p_val = 2.0 * (p1 < p2 ? p1 : p2);
7071
7072          double beta = (1.0 - conf_level) / 2.0;
7073          ci_lower = estimate / qf_bisection(1.0 - beta, df_x, df_y);
7074          ci_upper = estimate / qf_bisection(beta, df_x, df_y);
7075        }
7076
7077        // --- Pack Results ---
7078        HV* restrict results = newHV();
7079        hv_store(results, "statistic", 9, newSVnv(statistic), 0);
7080
7081        AV* restrict param_av = newAV();
7082        av_push(param_av, newSVnv(df_x));
7083        av_push(param_av, newSVnv(df_y));
7084        hv_store(results, "parameter", 9, newRV_noinc((SV*)param_av), 0);
7085
7086        hv_store(results, "p_value", 7, newSVnv(p_val), 0);
7087
7088        AV* restrict conf_int = newAV();
7089        av_push(conf_int, newSVnv(ci_lower));
7090        av_push(conf_int, newSVnv(ci_upper));
7091        hv_store(results, "conf_int", 8, newRV_noinc((SV*)conf_int), 0);
7092
7093        hv_store(results, "estimate", 8, newSVnv(estimate), 0);
7094        hv_store(results, "null_value", 10, newSVnv(ratio), 0);
7095        hv_store(results, "alternative", 11, newSVpv(alternative, 0), 0);
7096        hv_store(results, "method", 6, newSVpv("F test to compare two variances", 0), 0);
7097
7098        RETVAL = newRV_noinc((SV*)results);
7099}
7100OUTPUT:
7101    RETVAL
7102
7103SV *sample(ref, n = 1)
7104    SV *ref
7105    IV n
7106PREINIT:
7107    SV *restrict ret = &PL_sv_undef;
7108CODE:
7109        if (!PL_srand_called) {
7110          (void)seedDrand01((Rand_seed_t)Perl_seed(aTHX));
7111          PL_srand_called = TRUE;
7112        }
7113        if (n < 0) n = 0;
7114        if (SvROK(ref)) {
7115                SV *restrict rv = SvRV(ref);
7116                /* --- HASH REFERENCE --- */
7117                if (SvTYPE(rv) == SVt_PVHV) {
7118                        HV *restrict hv    = (HV *)rv;
7119                        I32 count = hv_iterinit(hv);
7120                        I32 limit = (n < (IV)count) ? (I32)n : count;
7121                        HV *restrict ret_hv = newHV();
7122
7123                        if (count > 0 && limit > 0) {
7124                                 HE **restrict entries;
7125                                 HE  *restrict entry;
7126                                 unsigned i;
7127
7128                                 Newx(entries, count, HE *);
7129
7130                                 /* Collect all HE pointers in one pass */
7131                                 i = 0;
7132                                 while ((entry = hv_iternext(hv)))
7133                                     entries[i++] = entry;
7134
7135                                 /* Partial Fisher-Yates (only 'limit' passes) */
7136                                 for (i = 0; i < limit; i++) {
7137                                     I32 j    = i + (I32)(Drand01() * (count - i));
7138                                     HE *restrict tmp  = entries[i];
7139                                     entries[i] = entries[j];
7140                                     entries[j] = tmp;
7141                                 }
7142
7143                                 /* Pre-size result hash to avoid rehashing during population */
7144                                 hv_ksplit(ret_hv, limit);
7145
7146                                 for (i = 0; i < limit; i++) {
7147                                     HEK *restrict hek = HeKEY_hek(entries[i]);
7148                                     /*
7149                                      * hv_store() with a precomputed hash skips the hash
7150                                      * computation entirely.  Negative klen signals UTF-8.
7151                                      */
7152                                     (void)hv_store(
7153                                         ret_hv,
7154                                         HEK_KEY(hek),
7155                                         HEK_UTF8(hek) ? -(I32)HEK_LEN(hek) : (I32)HEK_LEN(hek),
7156                                         SvREFCNT_inc(HeVAL(entries[i])),  /* HeVAL: direct macro, no call */
7157                                         HeHASH(entries[i])                /* reuse precomputed hash */
7158                                     );
7159                                 }
7160                                 Safefree(entries);
7161                        }
7162                        ret = newRV_noinc((SV *)ret_hv);
7163                } else if (SvTYPE(rv) == SVt_PVAV) {/* --- ARRAY REFERENCE --- */
7164                        AV    *restrict av    = (AV *)rv;
7165                        size_t count = av_top_index(av) + 1;  /* signed; 0 for empty AV */
7166                        size_t limit = (n < count) ? (size_t)n : count;
7167                        AV    *restrict ret_av = newAV();
7168
7169                        /* Pre-allocate the result array to avoid incremental reallocs */
7170                        if (n > 0)
7171                                 av_extend(ret_av, (size_t)n - 1);
7172
7173                        if (count > 0) {
7174                                 SV    **restrict src = AvARRAY(av);   /* direct pointer into AV's C array */
7175                                 size_t *restrict idx;
7176                                 size_t  i;
7177
7178                                 /* Shuffle indices rather than SV** to keep the original AV intact */
7179                                 Newx(idx, count, size_t);
7180                                 for (i = 0; i < count; i++)
7181                                     idx[i] = i;
7182
7183                                 /* Partial Fisher-Yates on the index array */
7184                                 for (i = 0; i < limit; i++) {
7185                                     size_t j   = i + (size_t)(Drand01() * (count - i));
7186                                     size_t tmp = idx[i];
7187                                     idx[i]  = idx[j];
7188                                     idx[j]  = tmp;
7189                                 }
7190
7191                                 for (i = 0; i < (size_t)n; i++) {
7192                                     if (i < limit) {
7193                                         SV *restrict sv = src[idx[i]];   /* AvARRAY direct access — no av_fetch call */
7194                                         SV *push_sv;
7195                                                        if (sv && sv != &PL_sv_undef)
7196                                                                 push_sv = SvREFCNT_inc(sv);
7197                                                        else
7198                                                                 push_sv = newSV(0);
7199                                                        av_push(ret_av, push_sv);
7200                                     } else {
7201                                         av_push(ret_av, newSV(0));
7202                                     }
7203                                 }
7204                                 Safefree(idx);
7205                        } else {
7206                                for (size_t i = 0; i < (size_t)n; i++)
7207                                    av_push(ret_av, newSV(0));
7208                        }
7209                        ret = newRV_noinc((SV *)ret_av);
7210                }
7211        }
7212        RETVAL = ret;
7213OUTPUT:
7214    RETVAL