// minir — r/interpreter/builtins/stats.rs
1//! Statistics builtins: cor, cov, weighted.mean, scale, complete.cases, na.omit,
2//! and distribution functions (d/p/q for norm, unif, exp, gamma, beta, cauchy,
3//! weibull, lnorm, chisq, t, f, binom, pois, geom, hyper).
4
5use crate::interpreter::coerce::usize_to_f64;
6use crate::interpreter::value::*;
7use indexmap::IndexMap;
8use minir_macros::builtin;
9use std::f64::consts::{FRAC_1_SQRT_2, PI};
10
11// region: Helpers
12
13/// Extract na.rm flag from named arguments.
14fn extract_na_rm(named: &[(String, RValue)]) -> bool {
15    named.iter().any(|(n, v)| {
16        n == "na.rm" && v.as_vector().and_then(|v| v.as_logical_scalar()) == Some(true)
17    })
18}
19
20/// Extract a named f64 parameter from named args, falling back to positional.
21fn extract_param(
22    args: &[RValue],
23    named: &[(String, RValue)],
24    name: &str,
25    positional_index: usize,
26    default: f64,
27) -> f64 {
28    for (k, v) in named {
29        if k == name {
30            if let Some(rv) = v.as_vector() {
31                if let Some(d) = rv.as_double_scalar() {
32                    return d;
33                }
34            }
35        }
36    }
37    args.get(positional_index)
38        .and_then(|v| v.as_vector())
39        .and_then(|v| v.as_double_scalar())
40        .unwrap_or(default)
41}
42
43/// Extract a named bool parameter from named args, falling back to positional.
44fn extract_bool_param(
45    args: &[RValue],
46    named: &[(String, RValue)],
47    name: &str,
48    positional_index: usize,
49    default: bool,
50) -> bool {
51    for (k, v) in named {
52        if k == name {
53            if let Some(rv) = v.as_vector() {
54                if let Some(b) = rv.as_logical_scalar() {
55                    return b;
56                }
57            }
58        }
59    }
60    args.get(positional_index)
61        .and_then(|v| v.as_vector())
62        .and_then(|v| v.as_logical_scalar())
63        .unwrap_or(default)
64}
65
66// endregion
67
68// region: cov
69
70/// Sample covariance of two numeric vectors.
71///
72/// Computes sum((x - mean(x)) * (y - mean(y))) / (n - 1).
73///
74/// @param x numeric vector
75/// @param y numeric vector (same length as x)
76/// @return scalar double
77#[builtin(min_args = 2, namespace = "stats")]
78fn builtin_cov(args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
79    let x_vals = args[0]
80        .as_vector()
81        .ok_or_else(|| {
82            RError::new(
83                RErrorKind::Argument,
84                "cov() requires numeric vectors".to_string(),
85            )
86        })?
87        .to_doubles();
88    let y_vals = args[1]
89        .as_vector()
90        .ok_or_else(|| {
91            RError::new(
92                RErrorKind::Argument,
93                "cov() requires numeric vectors".to_string(),
94            )
95        })?
96        .to_doubles();
97    if x_vals.len() != y_vals.len() {
98        return Err(RError::new(
99            RErrorKind::Argument,
100            format!(
101                "cov() requires vectors of equal length, got {} and {}",
102                x_vals.len(),
103                y_vals.len()
104            ),
105        ));
106    }
107
108    // Collect paired non-NA values
109    let pairs: Vec<(f64, f64)> = x_vals
110        .iter()
111        .zip(y_vals.iter())
112        .filter_map(|(x, y)| match (x, y) {
113            (Some(a), Some(b)) => Some((*a, *b)),
114            _ => None,
115        })
116        .collect();
117
118    let n = usize_to_f64(pairs.len());
119    if n < 2.0 {
120        return Ok(RValue::vec(Vector::Double(vec![Some(f64::NAN)].into())));
121    }
122    let mean_x = pairs.iter().map(|(x, _)| x).sum::<f64>() / n;
123    let mean_y = pairs.iter().map(|(_, y)| y).sum::<f64>() / n;
124    let cov = pairs
125        .iter()
126        .map(|(x, y)| (x - mean_x) * (y - mean_y))
127        .sum::<f64>()
128        / (n - 1.0);
129    Ok(RValue::vec(Vector::Double(vec![Some(cov)].into())))
130}
131
132// endregion
133
134// region: cor
135
136/// Pearson correlation coefficient of two numeric vectors.
137///
138/// Computes cov(x, y) / (sd(x) * sd(y)). Only method = "pearson" is supported.
139///
140/// @param x numeric vector
141/// @param y numeric vector (same length as x)
142/// @param method character; only "pearson" is currently supported
143/// @return scalar double
144#[builtin(min_args = 2, namespace = "stats")]
145fn builtin_cor(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
146    // Check method (only pearson supported)
147    for (k, v) in named {
148        if k == "method" {
149            if let Some(vec) = v.as_vector() {
150                if let Some(s) = vec.as_character_scalar() {
151                    if s != "pearson" {
152                        return Err(RError::new(
153                            RErrorKind::Argument,
154                            format!("cor() only supports method = \"pearson\", got \"{}\"", s),
155                        ));
156                    }
157                }
158            }
159        }
160    }
161
162    let x_vals = args[0]
163        .as_vector()
164        .ok_or_else(|| {
165            RError::new(
166                RErrorKind::Argument,
167                "cor() requires numeric vectors".to_string(),
168            )
169        })?
170        .to_doubles();
171    let y_vals = args[1]
172        .as_vector()
173        .ok_or_else(|| {
174            RError::new(
175                RErrorKind::Argument,
176                "cor() requires numeric vectors".to_string(),
177            )
178        })?
179        .to_doubles();
180    if x_vals.len() != y_vals.len() {
181        return Err(RError::new(
182            RErrorKind::Argument,
183            format!(
184                "cor() requires vectors of equal length, got {} and {}",
185                x_vals.len(),
186                y_vals.len()
187            ),
188        ));
189    }
190
191    // Collect paired non-NA values
192    let pairs: Vec<(f64, f64)> = x_vals
193        .iter()
194        .zip(y_vals.iter())
195        .filter_map(|(x, y)| match (x, y) {
196            (Some(a), Some(b)) => Some((*a, *b)),
197            _ => None,
198        })
199        .collect();
200
201    let n = usize_to_f64(pairs.len());
202    if n < 2.0 {
203        return Ok(RValue::vec(Vector::Double(vec![Some(f64::NAN)].into())));
204    }
205    let mean_x = pairs.iter().map(|(x, _)| x).sum::<f64>() / n;
206    let mean_y = pairs.iter().map(|(_, y)| y).sum::<f64>() / n;
207    let cov_xy = pairs
208        .iter()
209        .map(|(x, y)| (x - mean_x) * (y - mean_y))
210        .sum::<f64>();
211    let var_x = pairs.iter().map(|(x, _)| (x - mean_x).powi(2)).sum::<f64>();
212    let var_y = pairs.iter().map(|(_, y)| (y - mean_y).powi(2)).sum::<f64>();
213    let denom = (var_x * var_y).sqrt();
214    if denom == 0.0 {
215        return Ok(RValue::vec(Vector::Double(vec![Some(f64::NAN)].into())));
216    }
217    let r = cov_xy / denom;
218    Ok(RValue::vec(Vector::Double(vec![Some(r)].into())))
219}
220
221// endregion
222
223// region: weighted.mean
224
225/// Weighted arithmetic mean.
226///
227/// @param x numeric vector
228/// @param w numeric vector of weights (same length as x)
229/// @param na.rm logical; if TRUE, remove NAs before computing
230/// @return scalar double
231#[builtin(name = "weighted.mean", min_args = 2, namespace = "stats")]
232fn builtin_weighted_mean(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
233    let na_rm = extract_na_rm(named);
234    let x_vals = args[0]
235        .as_vector()
236        .ok_or_else(|| {
237            RError::new(
238                RErrorKind::Argument,
239                "weighted.mean() requires a numeric vector for 'x'".to_string(),
240            )
241        })?
242        .to_doubles();
243    let w_vals = args[1]
244        .as_vector()
245        .ok_or_else(|| {
246            RError::new(
247                RErrorKind::Argument,
248                "weighted.mean() requires a numeric vector for 'w'".to_string(),
249            )
250        })?
251        .to_doubles();
252    if x_vals.len() != w_vals.len() {
253        return Err(RError::new(
254            RErrorKind::Argument,
255            format!(
256                "weighted.mean() requires 'x' and 'w' of equal length, got {} and {}",
257                x_vals.len(),
258                w_vals.len()
259            ),
260        ));
261    }
262
263    let mut sum_wx = 0.0;
264    let mut sum_w = 0.0;
265    for (x, w) in x_vals.iter().zip(w_vals.iter()) {
266        match (x, w) {
267            (Some(xv), Some(wv)) => {
268                sum_wx += xv * wv;
269                sum_w += wv;
270            }
271            _ if !na_rm => {
272                return Ok(RValue::vec(Vector::Double(vec![None].into())));
273            }
274            _ => {}
275        }
276    }
277    if sum_w == 0.0 {
278        return Ok(RValue::vec(Vector::Double(vec![Some(f64::NAN)].into())));
279    }
280    Ok(RValue::vec(Vector::Double(
281        vec![Some(sum_wx / sum_w)].into(),
282    )))
283}
284
285// endregion
286
287// region: scale
288
289/// Center and/or scale a numeric vector.
290///
291/// When center = TRUE, subtracts the mean. When scale = TRUE, divides by the
292/// standard deviation. Returns a double vector with "scaled:center" and
293/// "scaled:scale" attributes.
294///
295/// @param x numeric vector
296/// @param center logical (default TRUE)
297/// @param scale logical (default TRUE)
298/// @return numeric vector
299#[builtin(min_args = 1, namespace = "stats")]
300fn builtin_scale(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
301    let do_center = extract_bool_param(args, named, "center", 1, true);
302    let do_scale = extract_bool_param(args, named, "scale", 2, true);
303
304    let vals = args[0]
305        .as_vector()
306        .ok_or_else(|| {
307            RError::new(
308                RErrorKind::Argument,
309                "scale() requires a numeric vector".to_string(),
310            )
311        })?
312        .to_doubles();
313
314    let non_na: Vec<f64> = vals.iter().copied().flatten().collect();
315    let n = usize_to_f64(non_na.len());
316
317    let center_val = if do_center && n > 0.0 {
318        non_na.iter().sum::<f64>() / n
319    } else {
320        0.0
321    };
322
323    let scale_val = if do_scale && n > 1.0 {
324        let mean = if do_center {
325            center_val
326        } else {
327            non_na.iter().sum::<f64>() / n
328        };
329        let var = non_na.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / (n - 1.0);
330        var.sqrt()
331    } else if do_scale {
332        // n <= 1, can't compute sd
333        f64::NAN
334    } else {
335        1.0
336    };
337
338    let result: Vec<Option<f64>> = vals
339        .iter()
340        .map(|x| {
341            x.map(|v| {
342                let centered = if do_center { v - center_val } else { v };
343                if do_scale && scale_val != 0.0 && !scale_val.is_nan() {
344                    centered / scale_val
345                } else {
346                    centered
347                }
348            })
349        })
350        .collect();
351
352    let mut attrs: IndexMap<String, RValue> = IndexMap::new();
353    if do_center {
354        attrs.insert(
355            "scaled:center".to_string(),
356            RValue::vec(Vector::Double(vec![Some(center_val)].into())),
357        );
358    }
359    if do_scale {
360        attrs.insert(
361            "scaled:scale".to_string(),
362            RValue::vec(Vector::Double(vec![Some(scale_val)].into())),
363        );
364    }
365
366    Ok(RValue::Vector(RVector {
367        inner: Vector::Double(result.into()),
368        attrs: if attrs.is_empty() {
369            None
370        } else {
371            Some(Box::new(attrs))
372        },
373    }))
374}
375
376// endregion
377
378// region: complete.cases
379
380/// Return a logical vector indicating which "rows" have no NA values.
381///
382/// For vectors, returns TRUE for each non-NA element.
383/// For multiple arguments (conceptually columns of a data frame), returns TRUE
384/// for positions where all arguments are non-NA.
385///
386/// @param ... one or more vectors
387/// @return logical vector
388#[builtin(name = "complete.cases", min_args = 1, namespace = "stats")]
389fn builtin_complete_cases(args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
390    // Determine length from first argument
391    let first_vec = args[0].as_vector().ok_or_else(|| {
392        RError::new(
393            RErrorKind::Argument,
394            "complete.cases() requires vector arguments".to_string(),
395        )
396    })?;
397    let n = first_vec.len();
398
399    // Collect all vectors
400    let mut cols: Vec<Vec<bool>> = Vec::with_capacity(args.len());
401    for arg in args {
402        let vec = arg.as_vector().ok_or_else(|| {
403            RError::new(
404                RErrorKind::Argument,
405                "complete.cases() requires vector arguments".to_string(),
406            )
407        })?;
408        if vec.len() != n {
409            return Err(RError::new(
410                RErrorKind::Argument,
411                "complete.cases() requires all arguments to have the same length".to_string(),
412            ));
413        }
414        let is_na = is_na_vec(vec);
415        cols.push(is_na);
416    }
417
418    // A row is complete if no column has NA at that position
419    let result: Vec<Option<bool>> = (0..n)
420        .map(|i| Some(!cols.iter().any(|col| col[i])))
421        .collect();
422    Ok(RValue::vec(Vector::Logical(result.into())))
423}
424
425/// Returns a Vec<bool> where true means the element is NA.
426fn is_na_vec(v: &Vector) -> Vec<bool> {
427    match v {
428        Vector::Logical(vals) => vals.iter().map(|x| x.is_none()).collect(),
429        Vector::Integer(vals) => vals.iter().map(|x| x.is_none()).collect(),
430        Vector::Double(vals) => vals
431            .iter()
432            .map(|x| x.is_none() || x.map(|f| f.is_nan()).unwrap_or(false))
433            .collect(),
434        Vector::Complex(vals) => vals.iter().map(|x| x.is_none()).collect(),
435        Vector::Character(vals) => vals.iter().map(|x| x.is_none()).collect(),
436        Vector::Raw(vals) => vals.iter().map(|_| false).collect(),
437    }
438}
439
440// endregion
441
442// region: na.omit
443
444/// Remove NA values from a vector.
445///
446/// Returns a new vector with all NA elements removed. Sets the "na.action"
447/// attribute to the indices of removed elements.
448///
449/// @param object vector
450/// @return vector with NAs removed
451#[builtin(name = "na.omit", min_args = 1, namespace = "stats")]
452fn builtin_na_omit(args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
453    let vec = args[0].as_vector().ok_or_else(|| {
454        RError::new(
455            RErrorKind::Argument,
456            "na.omit() requires a vector argument".to_string(),
457        )
458    })?;
459
460    let na_flags = is_na_vec(vec);
461    let na_indices: Vec<Option<i64>> = na_flags
462        .iter()
463        .enumerate()
464        .filter(|(_, &is_na)| is_na)
465        .map(|(i, _)| {
466            // R uses 1-based indexing
467            i64::try_from(i + 1).ok()
468        })
469        .collect();
470
471    let result = filter_non_na(vec, &na_flags);
472
473    let mut attrs: IndexMap<String, RValue> = IndexMap::new();
474    if !na_indices.is_empty() {
475        attrs.insert(
476            "na.action".to_string(),
477            RValue::vec(Vector::Integer(na_indices.into())),
478        );
479    }
480
481    Ok(RValue::Vector(RVector {
482        inner: result,
483        attrs: if attrs.is_empty() {
484            None
485        } else {
486            Some(Box::new(attrs))
487        },
488    }))
489}
490
491/// Filter a vector, keeping only non-NA elements.
492fn filter_non_na(v: &Vector, na_flags: &[bool]) -> Vector {
493    match v {
494        Vector::Logical(vals) => Vector::Logical(
495            vals.iter()
496                .zip(na_flags)
497                .filter(|(_, &is_na)| !is_na)
498                .map(|(x, _)| *x)
499                .collect::<Vec<_>>()
500                .into(),
501        ),
502        Vector::Integer(vals) => Vector::Integer(
503            vals.iter_opt()
504                .zip(na_flags)
505                .filter(|(_, &is_na)| !is_na)
506                .map(|(x, _)| x)
507                .collect::<Vec<_>>()
508                .into(),
509        ),
510        Vector::Double(vals) => Vector::Double(
511            vals.iter_opt()
512                .zip(na_flags)
513                .filter(|(_, &is_na)| !is_na)
514                .map(|(x, _)| x)
515                .collect::<Vec<_>>()
516                .into(),
517        ),
518        Vector::Complex(vals) => Vector::Complex(
519            vals.iter()
520                .zip(na_flags)
521                .filter(|(_, &is_na)| !is_na)
522                .map(|(x, _)| *x)
523                .collect::<Vec<_>>()
524                .into(),
525        ),
526        Vector::Character(vals) => Vector::Character(
527            vals.iter()
528                .zip(na_flags)
529                .filter(|(_, &is_na)| !is_na)
530                .map(|(x, _)| x.clone())
531                .collect::<Vec<_>>()
532                .into(),
533        ),
534        Vector::Raw(vals) => Vector::Raw(
535            vals.iter()
536                .zip(na_flags)
537                .filter(|(_, &is_na)| !is_na)
538                .map(|(x, _)| *x)
539                .collect(),
540        ),
541    }
542}
543
544// endregion
545
546// region: Distribution parameter helpers
547
548/// Extract the `log` flag from named arguments (for d* density functions).
549fn extract_log_flag(named: &[(String, RValue)]) -> bool {
550    named
551        .iter()
552        .find(|(n, _)| n == "log")
553        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
554        .unwrap_or(false)
555}
556
557/// Extract the `lower.tail` flag from named arguments (for p*/q* functions).
558fn extract_lower_tail(named: &[(String, RValue)]) -> bool {
559    named
560        .iter()
561        .find(|(n, _)| n == "lower.tail")
562        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
563        .unwrap_or(true)
564}
565
566/// Extract the `log.p` flag from named arguments (for p*/q* functions).
567fn extract_log_p(named: &[(String, RValue)]) -> bool {
568    named
569        .iter()
570        .find(|(n, _)| n == "log.p")
571        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
572        .unwrap_or(false)
573}
574
/// Post-process a density value: take the natural log when `log = TRUE`.
fn apply_d_log(val: f64, log_flag: bool) -> f64 {
    match log_flag {
        true => val.ln(),
        false => val,
    }
}
583
/// Post-process a probability: flip the tail for `lower.tail = FALSE`,
/// then take the log for `log.p = TRUE`.
fn apply_p_flags(val: f64, lower_tail: bool, log_p: bool) -> f64 {
    let prob = match lower_tail {
        true => val,
        false => 1.0 - val,
    };
    if log_p { prob.ln() } else { prob }
}
593
/// Pre-process a probability input for quantile functions: undo `log.p`
/// first, then undo `lower.tail = FALSE` by reflecting about 1.
fn apply_q_flags(p: f64, lower_tail: bool, log_p: bool) -> f64 {
    let prob = match log_p {
        true => p.exp(),
        false => p,
    };
    if lower_tail { prob } else { 1.0 - prob }
}
603
604// endregion
605
606// region: Normal distribution (dnorm, pnorm, qnorm)
607
/// Standard normal density: phi(x) = exp(-x^2 / 2) / sqrt(2*pi).
fn std_normal_pdf(x: f64) -> f64 {
    let norm = (2.0 * PI).sqrt();
    (-(x * x) / 2.0).exp() / norm
}
612
613/// Standard normal CDF using the error function.
614/// pnorm(x) = 0.5 * erfc(-x / sqrt(2))
615fn std_normal_cdf(x: f64) -> f64 {
616    0.5 * erfc(-x * FRAC_1_SQRT_2)
617}
618
/// Complementary error function, erfc(x) = 1 - erf(x).
///
/// Delegates to libm for machine-precision accuracy rather than using a
/// hand-rolled polynomial approximation.
fn erfc(x: f64) -> f64 {
    libm::erfc(x)
}
623
/// Inverse standard normal CDF (quantile function).
///
/// Uses Peter Acklam's (2003) rational approximation, accurate to roughly
/// 1.15e-9 in the central region. Probabilities at or beyond 0 and 1
/// saturate to -infinity and +infinity respectively.
fn std_normal_quantile(p: f64) -> f64 {
    if p <= 0.0 {
        return f64::NEG_INFINITY;
    }
    if p >= 1.0 {
        return f64::INFINITY;
    }
    if (p - 0.5).abs() < f64::EPSILON {
        return 0.0;
    }

    // Acklam's coefficients: A/B for the central region, C/D for the tails.
    const A: [f64; 6] = [
        -3.969_683_028_665_376e1,
        2.209_460_984_245_205e2,
        -2.759_285_104_469_687e2,
        1.383_577_518_672_69e2,
        -3.066_479_806_614_716e1,
        2.506_628_277_459_239,
    ];
    const B: [f64; 5] = [
        -5.447_609_879_822_406e1,
        1.615_858_368_580_409e2,
        -1.556_989_798_598_866e2,
        6.680_131_188_771_972e1,
        -1.328_068_155_288_572e1,
    ];
    const C: [f64; 6] = [
        -7.784_894_002_430_293e-3,
        -3.223_964_580_411_365e-1,
        -2.400_758_277_161_838,
        -2.549_732_539_343_734,
        4.374_664_141_464_968,
        2.938_163_982_698_783,
    ];
    const D: [f64; 4] = [
        7.784_695_709_041_462e-3,
        3.224_671_290_700_398e-1,
        2.445_134_137_142_996,
        3.754_408_661_907_416,
    ];

    const P_LOW: f64 = 0.02425;
    const P_HIGH: f64 = 1.0 - P_LOW;

    // Tail rational approximation, evaluated at q = sqrt(-2 ln p_tail).
    // The upper tail is the exact negation of the lower-tail formula.
    let tail = |q: f64| -> f64 {
        let num = ((((C[0] * q + C[1]) * q + C[2]) * q + C[3]) * q + C[4]) * q + C[5];
        let den = (((D[0] * q + D[1]) * q + D[2]) * q + D[3]) * q + 1.0;
        num / den
    };

    if p < P_LOW {
        // Lower tail.
        tail((-2.0 * p.ln()).sqrt())
    } else if p > P_HIGH {
        // Upper tail: mirror image of the lower tail.
        -tail((-2.0 * (1.0 - p).ln()).sqrt())
    } else {
        // Central region.
        let q = p - 0.5;
        let r = q * q;
        let num = (((((A[0] * r + A[1]) * r + A[2]) * r + A[3]) * r + A[4]) * r + A[5]) * q;
        let den = ((((B[0] * r + B[1]) * r + B[2]) * r + B[3]) * r + B[4]) * r + 1.0;
        num / den
    }
}
691
692/// Normal density function.
693///
694/// @param x quantile vector
695/// @param mean mean of the distribution (default 0)
696/// @param sd standard deviation (default 1)
697/// @param log logical; if TRUE, return log-density (default FALSE)
698/// @return numeric vector of densities
699#[builtin(min_args = 1, namespace = "stats")]
700fn builtin_dnorm(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
701    let mean = extract_param(args, named, "mean", 1, 0.0);
702    let sd = extract_param(args, named, "sd", 2, 1.0);
703    let log_flag = extract_log_flag(named);
704    if sd < 0.0 {
705        return Err(RError::new(
706            RErrorKind::Argument,
707            "dnorm(): 'sd' must be non-negative".to_string(),
708        ));
709    }
710    let vals = args[0]
711        .as_vector()
712        .ok_or_else(|| {
713            RError::new(
714                RErrorKind::Argument,
715                "dnorm() requires a numeric vector".to_string(),
716            )
717        })?
718        .to_doubles();
719    let result: Vec<Option<f64>> = vals
720        .iter()
721        .map(|x| {
722            x.map(|v| {
723                let d = if sd == 0.0 {
724                    if v == mean {
725                        f64::INFINITY
726                    } else {
727                        0.0
728                    }
729                } else {
730                    let z = (v - mean) / sd;
731                    std_normal_pdf(z) / sd
732                };
733                apply_d_log(d, log_flag)
734            })
735        })
736        .collect();
737    Ok(RValue::vec(Vector::Double(result.into())))
738}
739
740/// Normal cumulative distribution function.
741///
742/// @param q quantile vector
743/// @param mean mean of the distribution (default 0)
744/// @param sd standard deviation (default 1)
745/// @param lower.tail logical; if TRUE (default), return P(X <= q), else P(X > q)
746/// @param log.p logical; if TRUE, return log-probability (default FALSE)
747/// @return numeric vector of probabilities
748#[builtin(min_args = 1, namespace = "stats")]
749fn builtin_pnorm(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
750    let mean = extract_param(args, named, "mean", 1, 0.0);
751    let sd = extract_param(args, named, "sd", 2, 1.0);
752    let lower_tail = extract_lower_tail(named);
753    let log_p = extract_log_p(named);
754    if sd < 0.0 {
755        return Err(RError::new(
756            RErrorKind::Argument,
757            "pnorm(): 'sd' must be non-negative".to_string(),
758        ));
759    }
760    let vals = args[0]
761        .as_vector()
762        .ok_or_else(|| {
763            RError::new(
764                RErrorKind::Argument,
765                "pnorm() requires a numeric vector".to_string(),
766            )
767        })?
768        .to_doubles();
769    let result: Vec<Option<f64>> = vals
770        .iter()
771        .map(|x| {
772            x.map(|v| {
773                let p = if sd == 0.0 {
774                    if v < mean {
775                        0.0
776                    } else {
777                        1.0
778                    }
779                } else {
780                    let z = (v - mean) / sd;
781                    std_normal_cdf(z)
782                };
783                apply_p_flags(p, lower_tail, log_p)
784            })
785        })
786        .collect();
787    Ok(RValue::vec(Vector::Double(result.into())))
788}
789
790/// Normal quantile function (inverse CDF).
791///
792/// @param p probability vector
793/// @param mean mean of the distribution (default 0)
794/// @param sd standard deviation (default 1)
795/// @param lower.tail logical; if TRUE (default), p is P(X <= q), else P(X > q)
796/// @param log.p logical; if TRUE, p is given as log(p) (default FALSE)
797/// @return numeric vector of quantiles
798#[builtin(min_args = 1, namespace = "stats")]
799fn builtin_qnorm(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
800    let mean = extract_param(args, named, "mean", 1, 0.0);
801    let sd = extract_param(args, named, "sd", 2, 1.0);
802    let lower_tail = extract_lower_tail(named);
803    let log_p = extract_log_p(named);
804    if sd < 0.0 {
805        return Err(RError::new(
806            RErrorKind::Argument,
807            "qnorm(): 'sd' must be non-negative".to_string(),
808        ));
809    }
810    let vals = args[0]
811        .as_vector()
812        .ok_or_else(|| {
813            RError::new(
814                RErrorKind::Argument,
815                "qnorm() requires a numeric vector".to_string(),
816            )
817        })?
818        .to_doubles();
819    let result: Vec<Option<f64>> = vals
820        .iter()
821        .map(|x| {
822            x.map(|raw_p| {
823                let p = apply_q_flags(raw_p, lower_tail, log_p);
824                if !(0.0..=1.0).contains(&p) {
825                    f64::NAN
826                } else {
827                    std_normal_quantile(p) * sd + mean
828                }
829            })
830        })
831        .collect();
832    Ok(RValue::vec(Vector::Double(result.into())))
833}
834
835// endregion
836
837// region: Uniform distribution (dunif, punif, qunif)
838
839/// Uniform density function.
840///
841/// @param x quantile vector
842/// @param min lower limit (default 0)
843/// @param max upper limit (default 1)
844/// @return numeric vector of densities
845#[builtin(min_args = 1, namespace = "stats")]
846fn builtin_dunif(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
847    let min = extract_param(args, named, "min", 1, 0.0);
848    let max = extract_param(args, named, "max", 2, 1.0);
849    let log_flag = extract_log_flag(named);
850    if min > max {
851        return Err(RError::new(
852            RErrorKind::Argument,
853            "dunif(): 'min' must not be greater than 'max'".to_string(),
854        ));
855    }
856    let vals = args[0]
857        .as_vector()
858        .ok_or_else(|| {
859            RError::new(
860                RErrorKind::Argument,
861                "dunif() requires a numeric vector".to_string(),
862            )
863        })?
864        .to_doubles();
865    let result: Vec<Option<f64>> = vals
866        .iter()
867        .map(|x| {
868            x.map(|v| {
869                let d = if min == max {
870                    if v == min {
871                        f64::INFINITY
872                    } else {
873                        0.0
874                    }
875                } else if v >= min && v <= max {
876                    1.0 / (max - min)
877                } else {
878                    0.0
879                };
880                apply_d_log(d, log_flag)
881            })
882        })
883        .collect();
884    Ok(RValue::vec(Vector::Double(result.into())))
885}
886
887/// Uniform cumulative distribution function.
888///
889/// @param q quantile vector
890/// @param min lower limit (default 0)
891/// @param max upper limit (default 1)
892/// @return numeric vector of probabilities
893#[builtin(min_args = 1, namespace = "stats")]
894fn builtin_punif(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
895    let min = extract_param(args, named, "min", 1, 0.0);
896    let max = extract_param(args, named, "max", 2, 1.0);
897    let lower_tail = extract_lower_tail(named);
898    let log_p = extract_log_p(named);
899    if min > max {
900        return Err(RError::new(
901            RErrorKind::Argument,
902            "punif(): 'min' must not be greater than 'max'".to_string(),
903        ));
904    }
905    let vals = args[0]
906        .as_vector()
907        .ok_or_else(|| {
908            RError::new(
909                RErrorKind::Argument,
910                "punif() requires a numeric vector".to_string(),
911            )
912        })?
913        .to_doubles();
914    let result: Vec<Option<f64>> = vals
915        .iter()
916        .map(|x| {
917            x.map(|v| {
918                let p = if min == max {
919                    if v < min {
920                        0.0
921                    } else {
922                        1.0
923                    }
924                } else if v <= min {
925                    0.0
926                } else if v >= max {
927                    1.0
928                } else {
929                    (v - min) / (max - min)
930                };
931                apply_p_flags(p, lower_tail, log_p)
932            })
933        })
934        .collect();
935    Ok(RValue::vec(Vector::Double(result.into())))
936}
937
938/// Uniform quantile function (inverse CDF).
939///
940/// @param p probability vector
941/// @param min lower limit (default 0)
942/// @param max upper limit (default 1)
943/// @return numeric vector of quantiles
944#[builtin(min_args = 1, namespace = "stats")]
945fn builtin_qunif(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
946    let min = extract_param(args, named, "min", 1, 0.0);
947    let max = extract_param(args, named, "max", 2, 1.0);
948    let lower_tail = extract_lower_tail(named);
949    let log_p = extract_log_p(named);
950    if min > max {
951        return Err(RError::new(
952            RErrorKind::Argument,
953            "qunif(): 'min' must not be greater than 'max'".to_string(),
954        ));
955    }
956    let vals = args[0]
957        .as_vector()
958        .ok_or_else(|| {
959            RError::new(
960                RErrorKind::Argument,
961                "qunif() requires a numeric vector".to_string(),
962            )
963        })?
964        .to_doubles();
965    let result: Vec<Option<f64>> = vals
966        .iter()
967        .map(|x| {
968            x.map(|raw_p| {
969                let p = apply_q_flags(raw_p, lower_tail, log_p);
970                if !(0.0..=1.0).contains(&p) {
971                    f64::NAN
972                } else {
973                    min + p * (max - min)
974                }
975            })
976        })
977        .collect();
978    Ok(RValue::vec(Vector::Double(result.into())))
979}
980
981// endregion
982
983// region: Mathematical helpers for distribution functions
984
/// Log of the gamma function (via libm).
///
/// Single local entry point for ln Γ(x) so the distribution code below does
/// not call `libm` directly; delegates straight to `libm::lgamma`.
fn ln_gamma(x: f64) -> f64 {
    libm::lgamma(x)
}
989
990/// Log of the binomial coefficient: lchoose(n, k) = lgamma(n+1) - lgamma(k+1) - lgamma(n-k+1).
991fn lchoose(n: f64, k: f64) -> f64 {
992    ln_gamma(n + 1.0) - ln_gamma(k + 1.0) - ln_gamma(n - k + 1.0)
993}
994
995/// Regularized lower incomplete gamma function P(a, x) = gamma(a, x) / Gamma(a).
996/// Uses series expansion for x < a+1, continued fraction otherwise.
997fn regularized_gamma_p(a: f64, x: f64) -> f64 {
998    if x <= 0.0 {
999        return 0.0;
1000    }
1001    if a <= 0.0 {
1002        return 1.0;
1003    }
1004    if x < a + 1.0 {
1005        gamma_series(a, x)
1006    } else {
1007        1.0 - gamma_cf_lentz(a, x, ln_gamma(a))
1008    }
1009}
1010
1011/// Series expansion for the regularized incomplete gamma function.
1012fn gamma_series(a: f64, x: f64) -> f64 {
1013    let ln_gamma_a = ln_gamma(a);
1014    let mut sum = 1.0 / a;
1015    let mut term = 1.0 / a;
1016    for n in 1..200 {
1017        let nf = f64::from(n);
1018        term *= x / (a + nf);
1019        sum += term;
1020        if term.abs() < sum.abs() * 1e-15 {
1021            break;
1022        }
1023    }
1024    sum * (-x + a * x.ln() - ln_gamma_a).exp()
1025}
1026
/// Lentz's algorithm for the continued fraction representation of Q(a, x).
///
/// Evaluates the CF with coefficients b_0 = x+1-a, a_n = n*(a-n),
/// b_n = x+2n+1-a using the modified Lentz method, then applies the
/// prefactor x^a e^{-x} / Gamma(a).
fn gamma_cf_lentz(a: f64, x: f64, ln_gamma_a: f64) -> f64 {
    const TINY: f64 = 1e-30;
    // Keep Lentz denominators away from zero.
    let clamp = |v: f64| if v.abs() < TINY { TINY } else { v };

    let mut h = clamp(x + 1.0 - a);
    let mut c = h;
    let mut d = 0.0;

    for n in 1..200 {
        let nf = f64::from(n);
        let an = nf * (a - nf);
        let bn = x + 2.0 * nf + 1.0 - a;

        d = 1.0 / clamp(bn + an * d);
        c = clamp(bn + an / c);

        let delta = c * d;
        h *= delta;

        // Converged once the multiplicative update is 1 to machine precision.
        if (delta - 1.0).abs() < 1e-15 {
            break;
        }
    }

    (-x + a * x.ln() - ln_gamma_a).exp() / h
}
1061
1062/// Regularized incomplete beta function I_x(a, b) using the NR continued fraction.
1063fn regularized_beta(x: f64, a: f64, b: f64) -> f64 {
1064    if x <= 0.0 {
1065        return 0.0;
1066    }
1067    if x >= 1.0 {
1068        return 1.0;
1069    }
1070    // Use the symmetry relation when x > (a+1)/(a+b+2) for better convergence
1071    if x > (a + 1.0) / (a + b + 2.0) {
1072        return 1.0 - regularized_beta(1.0 - x, b, a);
1073    }
1074    let log_prefix = a * x.ln() + b * (1.0 - x).ln() - ln_gamma(a) - ln_gamma(b) + ln_gamma(a + b);
1075    let prefix = log_prefix.exp();
1076
1077    prefix * betacf(a, b, x) / a
1078}
1079
/// Continued fraction for the incomplete beta function (Numerical Recipes, 3rd ed).
///
/// Evaluates the CF representation of I_x(a,b) with the modified Lentz
/// method, alternating the even- and odd-step coefficients each iteration.
fn betacf(a: f64, b: f64, x: f64) -> f64 {
    const TINY: f64 = 1e-30;
    const EPS: f64 = 1e-14;
    // Keep Lentz denominators away from zero.
    let clamp = |v: f64| if v.abs() < TINY { TINY } else { v };

    let qab = a + b;
    let qap = a + 1.0;
    let qam = a - 1.0;

    let mut c = 1.0_f64;
    let mut d = 1.0 / clamp(1.0 - qab * x / qap);
    let mut h = d;

    for m in 1..300i32 {
        let mf = f64::from(m);
        let m2f = f64::from(2 * m);

        // Even step coefficient: m(b-m)x / ((a+2m-1)(a+2m))
        let aa = mf * (b - mf) * x / ((qam + m2f) * (a + m2f));
        d = 1.0 / clamp(1.0 + aa * d);
        c = clamp(1.0 + aa / c);
        h *= d * c;

        // Odd step coefficient: -(a+m)(a+b+m)x / ((a+2m)(a+2m+1))
        let aa = -(a + mf) * (qab + mf) * x / ((a + m2f) * (qap + m2f));
        d = 1.0 / clamp(1.0 + aa * d);
        c = clamp(1.0 + aa / c);
        let del = d * c;
        h *= del;

        // Converged once the multiplicative update is 1 to within EPS.
        if (del - 1.0).abs() < EPS {
            break;
        }
    }

    h
}
1137
/// Bisection-based quantile finder given a CDF.
/// Finds x such that cdf(x) = p.
///
/// The initial bracket [lo, hi] is widened geometrically until it contains
/// the target probability, then plain bisection is applied.
///
/// @param p target probability (clamped: p <= 0 returns lo, p >= 1 returns hi)
/// @param lo initial lower bracket
/// @param hi initial upper bracket
/// @param cdf monotone non-decreasing CDF to invert
fn quantile_bisect(p: f64, mut lo: f64, mut hi: f64, cdf: impl Fn(f64) -> f64) -> f64 {
    if p <= 0.0 {
        return lo;
    }
    if p >= 1.0 {
        return hi;
    }
    // Widen the lower bound with a doubling step. (The previous scheme moved
    // a non-positive `lo` by a fixed -1.0 per iteration — reaching the -1e15
    // bail-out would take ~1e15 iterations — and *halved* a positive `lo`,
    // which can never cross zero; either way a heavy left tail could hang.)
    let mut step = 1.0;
    while cdf(lo) > p {
        lo -= step;
        step *= 2.0;
        if lo < -1e15 {
            return f64::NEG_INFINITY;
        }
    }
    // Widen the upper bound geometrically (the +1.0 handles hi starting at 0).
    while cdf(hi) < p {
        hi = hi * 2.0 + 1.0;
        if hi > 1e15 {
            return f64::INFINITY;
        }
    }
    for _ in 0..100 {
        let mid = 0.5 * (lo + hi);
        // Relative-width convergence test.
        if (hi - lo).abs() < 1e-12 * (1.0 + mid.abs()) {
            return mid;
        }
        if cdf(mid) < p {
            lo = mid;
        } else {
            hi = mid;
        }
    }
    0.5 * (lo + hi)
}
1174
/// Bisection-based quantile finder for discrete distributions.
/// Finds smallest integer x such that cdf(x) >= p, searching in [lo, hi].
fn quantile_bisect_discrete(p: f64, lo: i64, hi: i64, cdf: impl Fn(i64) -> f64) -> f64 {
    if p <= 0.0 {
        return lo as f64;
    }
    if p >= 1.0 {
        return hi as f64;
    }
    // Standard lower-bound binary search: invariant is cdf(k) < p for all
    // k < low, and cdf(k) >= p for all k >= high.
    let mut low = lo;
    let mut high = hi;
    while low < high {
        let probe = low + (high - low) / 2;
        if cdf(probe) < p {
            low = probe + 1;
        } else {
            high = probe;
        }
    }
    low as f64
}
1195
1196/// Helper to vectorize a distribution function over the first argument.
1197fn dist_vectorize(args: &[RValue], fname: &str, f: impl Fn(f64) -> f64) -> Result<RValue, RError> {
1198    let vals = args[0]
1199        .as_vector()
1200        .ok_or_else(|| {
1201            RError::new(
1202                RErrorKind::Argument,
1203                format!("{fname}() requires a numeric vector"),
1204            )
1205        })?
1206        .to_doubles();
1207    let result: Vec<Option<f64>> = vals.iter().map(|x| x.map(&f)).collect();
1208    Ok(RValue::vec(Vector::Double(result.into())))
1209}
1210
1211/// Like `dist_vectorize` but applies `log` post-processing for density functions.
1212fn dist_vectorize_d(
1213    args: &[RValue],
1214    fname: &str,
1215    log_flag: bool,
1216    f: impl Fn(f64) -> f64,
1217) -> Result<RValue, RError> {
1218    dist_vectorize(args, fname, |x| apply_d_log(f(x), log_flag))
1219}
1220
1221/// Like `dist_vectorize` but applies `lower.tail`/`log.p` post-processing for CDF functions.
1222fn dist_vectorize_p(
1223    args: &[RValue],
1224    fname: &str,
1225    lower_tail: bool,
1226    log_p: bool,
1227    f: impl Fn(f64) -> f64,
1228) -> Result<RValue, RError> {
1229    dist_vectorize(args, fname, |x| apply_p_flags(f(x), lower_tail, log_p))
1230}
1231
1232/// Like `dist_vectorize` but applies `lower.tail`/`log.p` pre-processing for quantile functions.
1233fn dist_vectorize_q(
1234    args: &[RValue],
1235    fname: &str,
1236    lower_tail: bool,
1237    log_p: bool,
1238    f: impl Fn(f64) -> f64,
1239) -> Result<RValue, RError> {
1240    dist_vectorize(args, fname, |raw_p| {
1241        f(apply_q_flags(raw_p, lower_tail, log_p))
1242    })
1243}
1244
1245// endregion
1246
1247// region: Exponential distribution (dexp, pexp, qexp)
1248
1249/// Exponential density function.
1250///
1251/// @param x quantile vector
1252/// @param rate rate parameter (default 1)
1253/// @return numeric vector of densities
1254#[builtin(min_args = 1, namespace = "stats")]
1255fn builtin_dexp(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1256    let rate = extract_param(args, named, "rate", 1, 1.0);
1257    let log_flag = extract_log_flag(named);
1258    if rate <= 0.0 {
1259        return Err(RError::new(
1260            RErrorKind::Argument,
1261            "dexp(): 'rate' must be positive".to_string(),
1262        ));
1263    }
1264    dist_vectorize_d(args, "dexp", log_flag, |x| {
1265        if x < 0.0 {
1266            0.0
1267        } else {
1268            rate * (-rate * x).exp()
1269        }
1270    })
1271}
1272
1273/// Exponential cumulative distribution function.
1274///
1275/// @param q quantile vector
1276/// @param rate rate parameter (default 1)
1277/// @return numeric vector of probabilities
1278#[builtin(min_args = 1, namespace = "stats")]
1279fn builtin_pexp(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1280    let rate = extract_param(args, named, "rate", 1, 1.0);
1281    let lower_tail = extract_lower_tail(named);
1282    let log_p = extract_log_p(named);
1283    if rate <= 0.0 {
1284        return Err(RError::new(
1285            RErrorKind::Argument,
1286            "pexp(): 'rate' must be positive".to_string(),
1287        ));
1288    }
1289    dist_vectorize_p(args, "pexp", lower_tail, log_p, |q| {
1290        if q < 0.0 {
1291            0.0
1292        } else {
1293            1.0 - (-rate * q).exp()
1294        }
1295    })
1296}
1297
1298/// Exponential quantile function (inverse CDF).
1299///
1300/// @param p probability vector
1301/// @param rate rate parameter (default 1)
1302/// @return numeric vector of quantiles
1303#[builtin(min_args = 1, namespace = "stats")]
1304fn builtin_qexp(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1305    let rate = extract_param(args, named, "rate", 1, 1.0);
1306    let lower_tail = extract_lower_tail(named);
1307    let log_p = extract_log_p(named);
1308    if rate <= 0.0 {
1309        return Err(RError::new(
1310            RErrorKind::Argument,
1311            "qexp(): 'rate' must be positive".to_string(),
1312        ));
1313    }
1314    dist_vectorize_q(args, "qexp", lower_tail, log_p, |p| {
1315        if !(0.0..=1.0).contains(&p) {
1316            f64::NAN
1317        } else if p == 1.0 {
1318            f64::INFINITY
1319        } else {
1320            -(1.0 - p).ln() / rate
1321        }
1322    })
1323}
1324
1325// endregion
1326
1327// region: Gamma distribution (dgamma, pgamma, qgamma)
1328
1329/// Gamma density function.
1330///
1331/// @param x quantile vector
1332/// @param shape shape parameter
1333/// @param rate rate parameter (default 1); scale = 1/rate
1334/// @return numeric vector of densities
1335#[builtin(min_args = 2, namespace = "stats")]
1336fn builtin_dgamma(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1337    let shape = extract_param(args, named, "shape", 1, f64::NAN);
1338    let rate = extract_param(args, named, "rate", 2, 1.0);
1339    let log_flag = extract_log_flag(named);
1340    if shape <= 0.0 || rate <= 0.0 || shape.is_nan() {
1341        return Err(RError::new(
1342            RErrorKind::Argument,
1343            "dgamma(): 'shape' and 'rate' must be positive".to_string(),
1344        ));
1345    }
1346    let scale = 1.0 / rate;
1347    let log_norm = shape * scale.ln() + ln_gamma(shape);
1348    dist_vectorize_d(args, "dgamma", log_flag, |x| {
1349        if x < 0.0 {
1350            0.0
1351        } else if x == 0.0 {
1352            if shape < 1.0 {
1353                f64::INFINITY
1354            } else if shape == 1.0 {
1355                rate
1356            } else {
1357                0.0
1358            }
1359        } else {
1360            ((shape - 1.0) * x.ln() - x / scale - log_norm).exp()
1361        }
1362    })
1363}
1364
1365/// Gamma cumulative distribution function.
1366///
1367/// Uses the regularized incomplete gamma function.
1368///
1369/// @param q quantile vector
1370/// @param shape shape parameter
1371/// @param rate rate parameter (default 1)
1372/// @return numeric vector of probabilities
1373#[builtin(min_args = 2, namespace = "stats")]
1374fn builtin_pgamma(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1375    let shape = extract_param(args, named, "shape", 1, f64::NAN);
1376    let rate = extract_param(args, named, "rate", 2, 1.0);
1377    let lower_tail = extract_lower_tail(named);
1378    let log_p = extract_log_p(named);
1379    if shape <= 0.0 || rate <= 0.0 || shape.is_nan() {
1380        return Err(RError::new(
1381            RErrorKind::Argument,
1382            "pgamma(): 'shape' and 'rate' must be positive".to_string(),
1383        ));
1384    }
1385    let scale = 1.0 / rate;
1386    dist_vectorize_p(args, "pgamma", lower_tail, log_p, |q| {
1387        if q <= 0.0 {
1388            0.0
1389        } else {
1390            regularized_gamma_p(shape, q / scale)
1391        }
1392    })
1393}
1394
1395/// Gamma quantile function (inverse CDF).
1396///
1397/// Uses bisection on the CDF.
1398///
1399/// @param p probability vector
1400/// @param shape shape parameter
1401/// @param rate rate parameter (default 1)
1402/// @return numeric vector of quantiles
1403#[builtin(min_args = 2, namespace = "stats")]
1404fn builtin_qgamma(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1405    let shape = extract_param(args, named, "shape", 1, f64::NAN);
1406    let rate = extract_param(args, named, "rate", 2, 1.0);
1407    let lower_tail = extract_lower_tail(named);
1408    let log_p = extract_log_p(named);
1409    if shape <= 0.0 || rate <= 0.0 || shape.is_nan() {
1410        return Err(RError::new(
1411            RErrorKind::Argument,
1412            "qgamma(): 'shape' and 'rate' must be positive".to_string(),
1413        ));
1414    }
1415    let scale = 1.0 / rate;
1416    dist_vectorize_q(args, "qgamma", lower_tail, log_p, |p| {
1417        if !(0.0..=1.0).contains(&p) {
1418            f64::NAN
1419        } else if p == 0.0 {
1420            0.0
1421        } else if p == 1.0 {
1422            f64::INFINITY
1423        } else {
1424            let mean = shape * scale;
1425            quantile_bisect(p, 0.0, mean * 5.0 + 10.0, |x| {
1426                regularized_gamma_p(shape, x / scale)
1427            })
1428        }
1429    })
1430}
1431
1432// endregion
1433
1434// region: Beta distribution (dbeta, pbeta, qbeta)
1435
1436/// Beta density function.
1437///
1438/// @param x quantile vector (values in [0, 1])
1439/// @param shape1 first shape parameter
1440/// @param shape2 second shape parameter
1441/// @return numeric vector of densities
1442#[builtin(min_args = 3, namespace = "stats")]
1443fn builtin_dbeta(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1444    let shape1 = extract_param(args, named, "shape1", 1, f64::NAN);
1445    let shape2 = extract_param(args, named, "shape2", 2, f64::NAN);
1446    let log_flag = extract_log_flag(named);
1447    if shape1 <= 0.0 || shape2 <= 0.0 || shape1.is_nan() || shape2.is_nan() {
1448        return Err(RError::new(
1449            RErrorKind::Argument,
1450            "dbeta(): 'shape1' and 'shape2' must be positive".to_string(),
1451        ));
1452    }
1453    let log_beta = ln_gamma(shape1) + ln_gamma(shape2) - ln_gamma(shape1 + shape2);
1454    dist_vectorize_d(args, "dbeta", log_flag, |x| {
1455        if !(0.0..=1.0).contains(&x) {
1456            0.0
1457        } else if x == 0.0 {
1458            if shape1 < 1.0 {
1459                f64::INFINITY
1460            } else if shape1 == 1.0 {
1461                shape2
1462            } else {
1463                0.0
1464            }
1465        } else if x == 1.0 {
1466            if shape2 < 1.0 {
1467                f64::INFINITY
1468            } else if shape2 == 1.0 {
1469                shape1
1470            } else {
1471                0.0
1472            }
1473        } else {
1474            ((shape1 - 1.0) * x.ln() + (shape2 - 1.0) * (1.0 - x).ln() - log_beta).exp()
1475        }
1476    })
1477}
1478
1479/// Beta cumulative distribution function.
1480///
1481/// Uses the regularized incomplete beta function.
1482///
1483/// @param q quantile vector
1484/// @param shape1 first shape parameter
1485/// @param shape2 second shape parameter
1486/// @return numeric vector of probabilities
1487#[builtin(min_args = 3, namespace = "stats")]
1488fn builtin_pbeta(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1489    let shape1 = extract_param(args, named, "shape1", 1, f64::NAN);
1490    let shape2 = extract_param(args, named, "shape2", 2, f64::NAN);
1491    let lower_tail = extract_lower_tail(named);
1492    let log_p = extract_log_p(named);
1493    if shape1 <= 0.0 || shape2 <= 0.0 || shape1.is_nan() || shape2.is_nan() {
1494        return Err(RError::new(
1495            RErrorKind::Argument,
1496            "pbeta(): 'shape1' and 'shape2' must be positive".to_string(),
1497        ));
1498    }
1499    dist_vectorize_p(args, "pbeta", lower_tail, log_p, |q| {
1500        if q <= 0.0 {
1501            0.0
1502        } else if q >= 1.0 {
1503            1.0
1504        } else {
1505            regularized_beta(q, shape1, shape2)
1506        }
1507    })
1508}
1509
1510/// Beta quantile function (inverse CDF).
1511///
1512/// Uses bisection on the CDF.
1513///
1514/// @param p probability vector
1515/// @param shape1 first shape parameter
1516/// @param shape2 second shape parameter
1517/// @return numeric vector of quantiles
1518#[builtin(min_args = 3, namespace = "stats")]
1519fn builtin_qbeta(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1520    let shape1 = extract_param(args, named, "shape1", 1, f64::NAN);
1521    let shape2 = extract_param(args, named, "shape2", 2, f64::NAN);
1522    let lower_tail = extract_lower_tail(named);
1523    let log_p = extract_log_p(named);
1524    if shape1 <= 0.0 || shape2 <= 0.0 || shape1.is_nan() || shape2.is_nan() {
1525        return Err(RError::new(
1526            RErrorKind::Argument,
1527            "qbeta(): 'shape1' and 'shape2' must be positive".to_string(),
1528        ));
1529    }
1530    dist_vectorize_q(args, "qbeta", lower_tail, log_p, |p| {
1531        if !(0.0..=1.0).contains(&p) {
1532            f64::NAN
1533        } else if p == 0.0 {
1534            0.0
1535        } else if p == 1.0 {
1536            1.0
1537        } else {
1538            quantile_bisect(p, 0.0, 1.0, |x| regularized_beta(x, shape1, shape2))
1539        }
1540    })
1541}
1542
1543// endregion
1544
1545// region: Cauchy distribution (dcauchy, pcauchy, qcauchy)
1546
1547/// Cauchy density function.
1548///
1549/// @param x quantile vector
1550/// @param location location parameter (default 0)
1551/// @param scale scale parameter (default 1)
1552/// @return numeric vector of densities
1553#[builtin(min_args = 1, namespace = "stats")]
1554fn builtin_dcauchy(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1555    let location = extract_param(args, named, "location", 1, 0.0);
1556    let scale = extract_param(args, named, "scale", 2, 1.0);
1557    let log_flag = extract_log_flag(named);
1558    if scale <= 0.0 {
1559        return Err(RError::new(
1560            RErrorKind::Argument,
1561            "dcauchy(): 'scale' must be positive".to_string(),
1562        ));
1563    }
1564    dist_vectorize_d(args, "dcauchy", log_flag, |x| {
1565        let z = (x - location) / scale;
1566        1.0 / (PI * scale * (1.0 + z * z))
1567    })
1568}
1569
1570/// Cauchy cumulative distribution function.
1571///
1572/// @param q quantile vector
1573/// @param location location parameter (default 0)
1574/// @param scale scale parameter (default 1)
1575/// @return numeric vector of probabilities
1576#[builtin(min_args = 1, namespace = "stats")]
1577fn builtin_pcauchy(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1578    let location = extract_param(args, named, "location", 1, 0.0);
1579    let scale = extract_param(args, named, "scale", 2, 1.0);
1580    let lower_tail = extract_lower_tail(named);
1581    let log_p = extract_log_p(named);
1582    if scale <= 0.0 {
1583        return Err(RError::new(
1584            RErrorKind::Argument,
1585            "pcauchy(): 'scale' must be positive".to_string(),
1586        ));
1587    }
1588    dist_vectorize_p(args, "pcauchy", lower_tail, log_p, |q| {
1589        0.5 + ((q - location) / scale).atan() / PI
1590    })
1591}
1592
1593/// Cauchy quantile function (inverse CDF).
1594///
1595/// @param p probability vector
1596/// @param location location parameter (default 0)
1597/// @param scale scale parameter (default 1)
1598/// @return numeric vector of quantiles
1599#[builtin(min_args = 1, namespace = "stats")]
1600fn builtin_qcauchy(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1601    let location = extract_param(args, named, "location", 1, 0.0);
1602    let scale = extract_param(args, named, "scale", 2, 1.0);
1603    let lower_tail = extract_lower_tail(named);
1604    let log_p = extract_log_p(named);
1605    if scale <= 0.0 {
1606        return Err(RError::new(
1607            RErrorKind::Argument,
1608            "qcauchy(): 'scale' must be positive".to_string(),
1609        ));
1610    }
1611    dist_vectorize_q(args, "qcauchy", lower_tail, log_p, |p| {
1612        if !(0.0..=1.0).contains(&p) {
1613            f64::NAN
1614        } else if p == 0.0 {
1615            f64::NEG_INFINITY
1616        } else if p == 1.0 {
1617            f64::INFINITY
1618        } else {
1619            location + scale * (PI * (p - 0.5)).tan()
1620        }
1621    })
1622}
1623
1624// endregion
1625
1626// region: Weibull distribution (dweibull, pweibull, qweibull)
1627
1628/// Weibull density function.
1629///
1630/// @param x quantile vector
1631/// @param shape shape parameter
1632/// @param scale scale parameter (default 1)
1633/// @return numeric vector of densities
1634#[builtin(min_args = 2, namespace = "stats")]
1635fn builtin_dweibull(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1636    let shape = extract_param(args, named, "shape", 1, f64::NAN);
1637    let scale = extract_param(args, named, "scale", 2, 1.0);
1638    let log_flag = extract_log_flag(named);
1639    if shape <= 0.0 || scale <= 0.0 || shape.is_nan() {
1640        return Err(RError::new(
1641            RErrorKind::Argument,
1642            "dweibull(): 'shape' and 'scale' must be positive".to_string(),
1643        ));
1644    }
1645    dist_vectorize_d(args, "dweibull", log_flag, |x| {
1646        if x < 0.0 {
1647            0.0
1648        } else if x == 0.0 {
1649            if shape < 1.0 {
1650                f64::INFINITY
1651            } else if shape == 1.0 {
1652                1.0 / scale
1653            } else {
1654                0.0
1655            }
1656        } else {
1657            let z = x / scale;
1658            (shape / scale) * z.powf(shape - 1.0) * (-z.powf(shape)).exp()
1659        }
1660    })
1661}
1662
1663/// Weibull cumulative distribution function.
1664///
1665/// @param q quantile vector
1666/// @param shape shape parameter
1667/// @param scale scale parameter (default 1)
1668/// @return numeric vector of probabilities
1669#[builtin(min_args = 2, namespace = "stats")]
1670fn builtin_pweibull(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1671    let shape = extract_param(args, named, "shape", 1, f64::NAN);
1672    let scale = extract_param(args, named, "scale", 2, 1.0);
1673    let lower_tail = extract_lower_tail(named);
1674    let log_p = extract_log_p(named);
1675    if shape <= 0.0 || scale <= 0.0 || shape.is_nan() {
1676        return Err(RError::new(
1677            RErrorKind::Argument,
1678            "pweibull(): 'shape' and 'scale' must be positive".to_string(),
1679        ));
1680    }
1681    dist_vectorize_p(args, "pweibull", lower_tail, log_p, |q| {
1682        if q <= 0.0 {
1683            0.0
1684        } else {
1685            1.0 - (-(q / scale).powf(shape)).exp()
1686        }
1687    })
1688}
1689
1690/// Weibull quantile function (inverse CDF).
1691///
1692/// @param p probability vector
1693/// @param shape shape parameter
1694/// @param scale scale parameter (default 1)
1695/// @return numeric vector of quantiles
1696#[builtin(min_args = 2, namespace = "stats")]
1697fn builtin_qweibull(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1698    let shape = extract_param(args, named, "shape", 1, f64::NAN);
1699    let scale = extract_param(args, named, "scale", 2, 1.0);
1700    let lower_tail = extract_lower_tail(named);
1701    let log_p = extract_log_p(named);
1702    if shape <= 0.0 || scale <= 0.0 || shape.is_nan() {
1703        return Err(RError::new(
1704            RErrorKind::Argument,
1705            "qweibull(): 'shape' and 'scale' must be positive".to_string(),
1706        ));
1707    }
1708    dist_vectorize_q(args, "qweibull", lower_tail, log_p, |p| {
1709        if !(0.0..=1.0).contains(&p) {
1710            f64::NAN
1711        } else if p == 0.0 {
1712            0.0
1713        } else if p == 1.0 {
1714            f64::INFINITY
1715        } else {
1716            scale * (-(1.0 - p).ln()).powf(1.0 / shape)
1717        }
1718    })
1719}
1720
1721// endregion
1722
1723// region: Log-normal distribution (dlnorm, plnorm, qlnorm)
1724
1725/// Log-normal density function.
1726///
1727/// @param x quantile vector
1728/// @param meanlog mean of the distribution on the log scale (default 0)
1729/// @param sdlog standard deviation on the log scale (default 1)
1730/// @return numeric vector of densities
1731#[builtin(min_args = 1, namespace = "stats")]
1732fn builtin_dlnorm(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1733    let meanlog = extract_param(args, named, "meanlog", 1, 0.0);
1734    let sdlog = extract_param(args, named, "sdlog", 2, 1.0);
1735    let log_flag = extract_log_flag(named);
1736    if sdlog < 0.0 {
1737        return Err(RError::new(
1738            RErrorKind::Argument,
1739            "dlnorm(): 'sdlog' must be non-negative".to_string(),
1740        ));
1741    }
1742    dist_vectorize_d(args, "dlnorm", log_flag, |x| {
1743        if x <= 0.0 {
1744            0.0
1745        } else if sdlog == 0.0 {
1746            if (x.ln() - meanlog).abs() < f64::EPSILON {
1747                f64::INFINITY
1748            } else {
1749                0.0
1750            }
1751        } else {
1752            let z = (x.ln() - meanlog) / sdlog;
1753            std_normal_pdf(z) / (x * sdlog)
1754        }
1755    })
1756}
1757
1758/// Log-normal cumulative distribution function.
1759///
1760/// @param q quantile vector
1761/// @param meanlog mean of the distribution on the log scale (default 0)
1762/// @param sdlog standard deviation on the log scale (default 1)
1763/// @return numeric vector of probabilities
1764#[builtin(min_args = 1, namespace = "stats")]
1765fn builtin_plnorm(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1766    let meanlog = extract_param(args, named, "meanlog", 1, 0.0);
1767    let sdlog = extract_param(args, named, "sdlog", 2, 1.0);
1768    let lower_tail = extract_lower_tail(named);
1769    let log_p = extract_log_p(named);
1770    if sdlog < 0.0 {
1771        return Err(RError::new(
1772            RErrorKind::Argument,
1773            "plnorm(): 'sdlog' must be non-negative".to_string(),
1774        ));
1775    }
1776    dist_vectorize_p(args, "plnorm", lower_tail, log_p, |q| {
1777        if q <= 0.0 {
1778            0.0
1779        } else if sdlog == 0.0 {
1780            if q.ln() < meanlog {
1781                0.0
1782            } else {
1783                1.0
1784            }
1785        } else {
1786            let z = (q.ln() - meanlog) / sdlog;
1787            std_normal_cdf(z)
1788        }
1789    })
1790}
1791
1792/// Log-normal quantile function (inverse CDF).
1793///
1794/// @param p probability vector
1795/// @param meanlog mean of the distribution on the log scale (default 0)
1796/// @param sdlog standard deviation on the log scale (default 1)
1797/// @return numeric vector of quantiles
1798#[builtin(min_args = 1, namespace = "stats")]
1799fn builtin_qlnorm(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1800    let meanlog = extract_param(args, named, "meanlog", 1, 0.0);
1801    let sdlog = extract_param(args, named, "sdlog", 2, 1.0);
1802    let lower_tail = extract_lower_tail(named);
1803    let log_p = extract_log_p(named);
1804    if sdlog < 0.0 {
1805        return Err(RError::new(
1806            RErrorKind::Argument,
1807            "qlnorm(): 'sdlog' must be non-negative".to_string(),
1808        ));
1809    }
1810    dist_vectorize_q(args, "qlnorm", lower_tail, log_p, |p| {
1811        if !(0.0..=1.0).contains(&p) {
1812            f64::NAN
1813        } else if p == 0.0 {
1814            0.0
1815        } else if p == 1.0 {
1816            f64::INFINITY
1817        } else {
1818            (std_normal_quantile(p) * sdlog + meanlog).exp()
1819        }
1820    })
1821}
1822
1823// endregion
1824
1825// region: Chi-squared distribution (dchisq, pchisq, qchisq)
1826
1827/// Chi-squared density function.
1828///
1829/// The chi-squared distribution with df degrees of freedom is
1830/// Gamma(df/2, 1/2), so dchisq(x, df) = dgamma(x, df/2, rate = 0.5).
1831///
1832/// @param x quantile vector
1833/// @param df degrees of freedom
1834/// @return numeric vector of densities
1835#[builtin(min_args = 2, namespace = "stats")]
1836fn builtin_dchisq(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1837    let df = extract_param(args, named, "df", 1, f64::NAN);
1838    let log_flag = extract_log_flag(named);
1839    if df <= 0.0 || df.is_nan() {
1840        return Err(RError::new(
1841            RErrorKind::Argument,
1842            "dchisq(): 'df' must be positive".to_string(),
1843        ));
1844    }
1845    let shape = df / 2.0;
1846    let scale: f64 = 2.0;
1847    let log_norm = shape * scale.ln() + ln_gamma(shape);
1848    dist_vectorize_d(args, "dchisq", log_flag, |x| {
1849        if x < 0.0 {
1850            0.0
1851        } else if x == 0.0 {
1852            if shape < 1.0 {
1853                f64::INFINITY
1854            } else if shape == 1.0 {
1855                0.5
1856            } else {
1857                0.0
1858            }
1859        } else {
1860            ((shape - 1.0) * x.ln() - x / scale - log_norm).exp()
1861        }
1862    })
1863}
1864
1865/// Chi-squared cumulative distribution function.
1866///
1867/// @param q quantile vector
1868/// @param df degrees of freedom
1869/// @return numeric vector of probabilities
1870#[builtin(min_args = 2, namespace = "stats")]
1871fn builtin_pchisq(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1872    let df = extract_param(args, named, "df", 1, f64::NAN);
1873    let lower_tail = extract_lower_tail(named);
1874    let log_p = extract_log_p(named);
1875    if df <= 0.0 || df.is_nan() {
1876        return Err(RError::new(
1877            RErrorKind::Argument,
1878            "pchisq(): 'df' must be positive".to_string(),
1879        ));
1880    }
1881    let shape = df / 2.0;
1882    dist_vectorize_p(args, "pchisq", lower_tail, log_p, |q| {
1883        if q <= 0.0 {
1884            0.0
1885        } else {
1886            regularized_gamma_p(shape, q / 2.0)
1887        }
1888    })
1889}
1890
1891/// Chi-squared quantile function (inverse CDF).
1892///
1893/// @param p probability vector
1894/// @param df degrees of freedom
1895/// @return numeric vector of quantiles
1896#[builtin(min_args = 2, namespace = "stats")]
1897fn builtin_qchisq(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1898    let df = extract_param(args, named, "df", 1, f64::NAN);
1899    let lower_tail = extract_lower_tail(named);
1900    let log_p = extract_log_p(named);
1901    if df <= 0.0 || df.is_nan() {
1902        return Err(RError::new(
1903            RErrorKind::Argument,
1904            "qchisq(): 'df' must be positive".to_string(),
1905        ));
1906    }
1907    let shape = df / 2.0;
1908    dist_vectorize_q(args, "qchisq", lower_tail, log_p, |p| {
1909        if !(0.0..=1.0).contains(&p) {
1910            f64::NAN
1911        } else if p == 0.0 {
1912            0.0
1913        } else if p == 1.0 {
1914            f64::INFINITY
1915        } else {
1916            quantile_bisect(p, 0.0, df * 5.0 + 10.0, |x| {
1917                regularized_gamma_p(shape, x / 2.0)
1918            })
1919        }
1920    })
1921}
1922
1923// endregion
1924
1925// region: Student's t distribution (dt, pt, qt)
1926
1927/// Student's t density function.
1928///
1929/// dt(x, df) = gamma((df+1)/2) / (sqrt(df*pi) * gamma(df/2)) * (1 + x^2/df)^(-(df+1)/2)
1930///
1931/// @param x quantile vector
1932/// @param df degrees of freedom
1933/// @return numeric vector of densities
1934#[builtin(min_args = 2, namespace = "stats")]
1935fn builtin_dt(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1936    let df = extract_param(args, named, "df", 1, f64::NAN);
1937    let log_flag = extract_log_flag(named);
1938    if df <= 0.0 || df.is_nan() {
1939        return Err(RError::new(
1940            RErrorKind::Argument,
1941            "dt(): 'df' must be positive".to_string(),
1942        ));
1943    }
1944    let log_const = ln_gamma((df + 1.0) / 2.0) - ln_gamma(df / 2.0) - 0.5 * (df * PI).ln();
1945    dist_vectorize_d(args, "dt", log_flag, |x| {
1946        (log_const + (-(df + 1.0) / 2.0) * (1.0 + x * x / df).ln()).exp()
1947    })
1948}
1949
1950/// Student's t cumulative distribution function.
1951///
1952/// Uses the regularized incomplete beta function:
1953/// pt(x, df) = 1 - 0.5 * I_{df/(df+x^2)}(df/2, 1/2) for x >= 0
1954///
1955/// @param q quantile vector
1956/// @param df degrees of freedom
1957/// @return numeric vector of probabilities
1958#[builtin(min_args = 2, namespace = "stats")]
1959fn builtin_pt(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1960    let df = extract_param(args, named, "df", 1, f64::NAN);
1961    let lower_tail = extract_lower_tail(named);
1962    let log_p = extract_log_p(named);
1963    if df <= 0.0 || df.is_nan() {
1964        return Err(RError::new(
1965            RErrorKind::Argument,
1966            "pt(): 'df' must be positive".to_string(),
1967        ));
1968    }
1969    dist_vectorize_p(args, "pt", lower_tail, log_p, |q| {
1970        let x2 = q * q;
1971        let t = df / (df + x2);
1972        let half_ib = 0.5 * regularized_beta(t, df / 2.0, 0.5);
1973        if q >= 0.0 {
1974            1.0 - half_ib
1975        } else {
1976            half_ib
1977        }
1978    })
1979}
1980
1981/// Student's t quantile function (inverse CDF).
1982///
1983/// Uses bisection on the CDF.
1984///
1985/// @param p probability vector
1986/// @param df degrees of freedom
1987/// @return numeric vector of quantiles
1988#[builtin(min_args = 2, namespace = "stats")]
1989fn builtin_qt(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1990    let df = extract_param(args, named, "df", 1, f64::NAN);
1991    let lower_tail = extract_lower_tail(named);
1992    let log_p = extract_log_p(named);
1993    if df <= 0.0 || df.is_nan() {
1994        return Err(RError::new(
1995            RErrorKind::Argument,
1996            "qt(): 'df' must be positive".to_string(),
1997        ));
1998    }
1999    dist_vectorize_q(args, "qt", lower_tail, log_p, |p| {
2000        if !(0.0..=1.0).contains(&p) {
2001            f64::NAN
2002        } else if p == 0.0 {
2003            f64::NEG_INFINITY
2004        } else if p == 1.0 {
2005            f64::INFINITY
2006        } else {
2007            let pt_cdf = |q: f64| -> f64 {
2008                let x2 = q * q;
2009                let t = df / (df + x2);
2010                let half_ib = 0.5 * regularized_beta(t, df / 2.0, 0.5);
2011                if q >= 0.0 {
2012                    1.0 - half_ib
2013                } else {
2014                    half_ib
2015                }
2016            };
2017            let z = std_normal_quantile(p);
2018            let lo = z * 5.0 - 10.0;
2019            let hi = z * 5.0 + 10.0;
2020            quantile_bisect(p, lo, hi, pt_cdf)
2021        }
2022    })
2023}
2024
2025// endregion
2026
2027// region: F distribution (df, pf, qf)
2028
2029/// F density function.
2030///
2031/// @param x quantile vector
2032/// @param df1 numerator degrees of freedom
2033/// @param df2 denominator degrees of freedom
2034/// @return numeric vector of densities
2035#[builtin(name = "df", min_args = 3, namespace = "stats")]
2036fn builtin_df_dist(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2037    let df1 = extract_param(args, named, "df1", 1, f64::NAN);
2038    let df2 = extract_param(args, named, "df2", 2, f64::NAN);
2039    let log_flag = extract_log_flag(named);
2040    if df1 <= 0.0 || df2 <= 0.0 || df1.is_nan() || df2.is_nan() {
2041        return Err(RError::new(
2042            RErrorKind::Argument,
2043            "df(): 'df1' and 'df2' must be positive".to_string(),
2044        ));
2045    }
2046    let log_const = 0.5 * (df1 * df1.ln() + df2 * df2.ln()) + ln_gamma((df1 + df2) / 2.0)
2047        - ln_gamma(df1 / 2.0)
2048        - ln_gamma(df2 / 2.0);
2049    dist_vectorize_d(args, "df", log_flag, |x| {
2050        if x < 0.0 {
2051            0.0
2052        } else if x == 0.0 {
2053            if df1 < 2.0 {
2054                f64::INFINITY
2055            } else if df1 == 2.0 {
2056                1.0
2057            } else {
2058                0.0
2059            }
2060        } else {
2061            (log_const + (df1 / 2.0 - 1.0) * x.ln() - ((df1 + df2) / 2.0) * (df1 * x + df2).ln())
2062                .exp()
2063        }
2064    })
2065}
2066
2067/// F cumulative distribution function.
2068///
2069/// Uses the regularized incomplete beta function:
2070/// pf(x, d1, d2) = I_{d1*x/(d1*x+d2)}(d1/2, d2/2)
2071///
2072/// @param q quantile vector
2073/// @param df1 numerator degrees of freedom
2074/// @param df2 denominator degrees of freedom
2075/// @return numeric vector of probabilities
2076#[builtin(min_args = 3, namespace = "stats")]
2077fn builtin_pf(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2078    let df1 = extract_param(args, named, "df1", 1, f64::NAN);
2079    let df2 = extract_param(args, named, "df2", 2, f64::NAN);
2080    let lower_tail = extract_lower_tail(named);
2081    let log_p = extract_log_p(named);
2082    if df1 <= 0.0 || df2 <= 0.0 || df1.is_nan() || df2.is_nan() {
2083        return Err(RError::new(
2084            RErrorKind::Argument,
2085            "pf(): 'df1' and 'df2' must be positive".to_string(),
2086        ));
2087    }
2088    dist_vectorize_p(args, "pf", lower_tail, log_p, |q| {
2089        if q <= 0.0 {
2090            0.0
2091        } else {
2092            let t = df1 * q / (df1 * q + df2);
2093            regularized_beta(t, df1 / 2.0, df2 / 2.0)
2094        }
2095    })
2096}
2097
2098/// F quantile function (inverse CDF).
2099///
2100/// Uses bisection on the CDF.
2101///
2102/// @param p probability vector
2103/// @param df1 numerator degrees of freedom
2104/// @param df2 denominator degrees of freedom
2105/// @return numeric vector of quantiles
2106#[builtin(min_args = 3, namespace = "stats")]
2107fn builtin_qf(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2108    let df1 = extract_param(args, named, "df1", 1, f64::NAN);
2109    let df2 = extract_param(args, named, "df2", 2, f64::NAN);
2110    let lower_tail = extract_lower_tail(named);
2111    let log_p = extract_log_p(named);
2112    if df1 <= 0.0 || df2 <= 0.0 || df1.is_nan() || df2.is_nan() {
2113        return Err(RError::new(
2114            RErrorKind::Argument,
2115            "qf(): 'df1' and 'df2' must be positive".to_string(),
2116        ));
2117    }
2118    dist_vectorize_q(args, "qf", lower_tail, log_p, |p| {
2119        if !(0.0..=1.0).contains(&p) {
2120            f64::NAN
2121        } else if p == 0.0 {
2122            0.0
2123        } else if p == 1.0 {
2124            f64::INFINITY
2125        } else {
2126            let pf_cdf = |q: f64| -> f64 {
2127                let t = df1 * q / (df1 * q + df2);
2128                regularized_beta(t, df1 / 2.0, df2 / 2.0)
2129            };
2130            let mean_est = if df2 > 2.0 { df2 / (df2 - 2.0) } else { 1.0 };
2131            quantile_bisect(p, 0.0, mean_est * 10.0 + 10.0, pf_cdf)
2132        }
2133    })
2134}
2135
2136// endregion
2137
2138// region: Binomial distribution (dbinom, pbinom, qbinom)
2139
2140/// Binomial density (probability mass) function.
2141///
2142/// dbinom(x, size, prob) = choose(size, x) * prob^x * (1-prob)^(size-x)
2143///
2144/// @param x quantile vector (integer values)
2145/// @param size number of trials
2146/// @param prob probability of success on each trial
2147/// @return numeric vector of probabilities
2148#[builtin(min_args = 3, namespace = "stats")]
2149fn builtin_dbinom(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2150    let size = extract_param(args, named, "size", 1, f64::NAN);
2151    let prob = extract_param(args, named, "prob", 2, f64::NAN);
2152    let log_flag = extract_log_flag(named);
2153    if size < 0.0 || size.is_nan() || size != size.floor() {
2154        return Err(RError::new(
2155            RErrorKind::Argument,
2156            "dbinom(): 'size' must be a non-negative integer".to_string(),
2157        ));
2158    }
2159    if !(0.0..=1.0).contains(&prob) || prob.is_nan() {
2160        return Err(RError::new(
2161            RErrorKind::Argument,
2162            "dbinom(): 'prob' must be in [0, 1]".to_string(),
2163        ));
2164    }
2165    let n = size;
2166    dist_vectorize_d(args, "dbinom", log_flag, |x| {
2167        let k = x.round();
2168        if (x - k).abs() > 1e-7 || k < 0.0 || k > n {
2169            0.0
2170        } else if prob == 0.0 {
2171            if k == 0.0 {
2172                1.0
2173            } else {
2174                0.0
2175            }
2176        } else if prob == 1.0 {
2177            if k == n {
2178                1.0
2179            } else {
2180                0.0
2181            }
2182        } else {
2183            (lchoose(n, k) + k * prob.ln() + (n - k) * (1.0 - prob).ln()).exp()
2184        }
2185    })
2186}
2187
2188/// Binomial cumulative distribution function.
2189///
2190/// pbinom(q, size, prob) = sum_{k=0}^{floor(q)} dbinom(k, size, prob)
2191/// Uses regularized beta: pbinom(k, n, p) = 1 - I_p(k+1, n-k)
2192///
2193/// @param q quantile vector
2194/// @param size number of trials
2195/// @param prob probability of success on each trial
2196/// @return numeric vector of probabilities
2197#[builtin(min_args = 3, namespace = "stats")]
2198fn builtin_pbinom(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2199    let size = extract_param(args, named, "size", 1, f64::NAN);
2200    let prob = extract_param(args, named, "prob", 2, f64::NAN);
2201    let lower_tail = extract_lower_tail(named);
2202    let log_p = extract_log_p(named);
2203    if size < 0.0 || size.is_nan() || size != size.floor() {
2204        return Err(RError::new(
2205            RErrorKind::Argument,
2206            "pbinom(): 'size' must be a non-negative integer".to_string(),
2207        ));
2208    }
2209    if !(0.0..=1.0).contains(&prob) || prob.is_nan() {
2210        return Err(RError::new(
2211            RErrorKind::Argument,
2212            "pbinom(): 'prob' must be in [0, 1]".to_string(),
2213        ));
2214    }
2215    let n = size as i64;
2216    dist_vectorize_p(args, "pbinom", lower_tail, log_p, |q| {
2217        if q < 0.0 {
2218            0.0
2219        } else if q >= size {
2220            1.0
2221        } else {
2222            let k = q.floor() as i64;
2223            let kf = k as f64;
2224            1.0 - regularized_beta(prob, kf + 1.0, (n - k) as f64)
2225        }
2226    })
2227}
2228
2229/// Binomial quantile function (inverse CDF).
2230///
2231/// Uses bisection on the discrete CDF.
2232///
2233/// @param p probability vector
2234/// @param size number of trials
2235/// @param prob probability of success on each trial
2236/// @return numeric vector of quantiles
2237#[builtin(min_args = 3, namespace = "stats")]
2238fn builtin_qbinom(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2239    let size = extract_param(args, named, "size", 1, f64::NAN);
2240    let prob = extract_param(args, named, "prob", 2, f64::NAN);
2241    let lower_tail = extract_lower_tail(named);
2242    let log_p = extract_log_p(named);
2243    if size < 0.0 || size.is_nan() || size != size.floor() {
2244        return Err(RError::new(
2245            RErrorKind::Argument,
2246            "qbinom(): 'size' must be a non-negative integer".to_string(),
2247        ));
2248    }
2249    if !(0.0..=1.0).contains(&prob) || prob.is_nan() {
2250        return Err(RError::new(
2251            RErrorKind::Argument,
2252            "qbinom(): 'prob' must be in [0, 1]".to_string(),
2253        ));
2254    }
2255    let n = size as i64;
2256    dist_vectorize_q(args, "qbinom", lower_tail, log_p, |p| {
2257        if !(0.0..=1.0).contains(&p) {
2258            f64::NAN
2259        } else if p == 0.0 {
2260            0.0
2261        } else if p == 1.0 {
2262            size
2263        } else {
2264            quantile_bisect_discrete(p, 0, n, |k| {
2265                let kf = k as f64;
2266                1.0 - regularized_beta(prob, kf + 1.0, (n - k) as f64)
2267            })
2268        }
2269    })
2270}
2271
2272// endregion
2273
2274// region: Poisson distribution (dpois, ppois, qpois)
2275
2276/// Poisson density (probability mass) function.
2277///
2278/// dpois(x, lambda) = lambda^x * exp(-lambda) / x!
2279///
2280/// @param x quantile vector (non-negative integers)
2281/// @param lambda mean rate parameter
2282/// @return numeric vector of probabilities
2283#[builtin(min_args = 2, namespace = "stats")]
2284fn builtin_dpois(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2285    let lambda = extract_param(args, named, "lambda", 1, f64::NAN);
2286    let log_flag = extract_log_flag(named);
2287    if lambda < 0.0 || lambda.is_nan() {
2288        return Err(RError::new(
2289            RErrorKind::Argument,
2290            "dpois(): 'lambda' must be non-negative".to_string(),
2291        ));
2292    }
2293    dist_vectorize_d(args, "dpois", log_flag, |x| {
2294        let k = x.round();
2295        if (x - k).abs() > 1e-7 || k < 0.0 {
2296            0.0
2297        } else if lambda == 0.0 {
2298            if k == 0.0 {
2299                1.0
2300            } else {
2301                0.0
2302            }
2303        } else {
2304            (k * lambda.ln() - lambda - ln_gamma(k + 1.0)).exp()
2305        }
2306    })
2307}
2308
2309/// Poisson cumulative distribution function.
2310///
2311/// ppois(q, lambda) = sum_{k=0}^{floor(q)} dpois(k, lambda)
2312/// Uses the regularized gamma: ppois(k, lambda) = 1 - P(k+1, lambda)
2313///
2314/// @param q quantile vector
2315/// @param lambda mean rate parameter
2316/// @return numeric vector of probabilities
2317#[builtin(min_args = 2, namespace = "stats")]
2318fn builtin_ppois(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2319    let lambda = extract_param(args, named, "lambda", 1, f64::NAN);
2320    let lower_tail = extract_lower_tail(named);
2321    let log_p = extract_log_p(named);
2322    if lambda < 0.0 || lambda.is_nan() {
2323        return Err(RError::new(
2324            RErrorKind::Argument,
2325            "ppois(): 'lambda' must be non-negative".to_string(),
2326        ));
2327    }
2328    dist_vectorize_p(args, "ppois", lower_tail, log_p, |q| {
2329        if q < 0.0 {
2330            0.0
2331        } else if lambda == 0.0 {
2332            1.0
2333        } else {
2334            let k = q.floor();
2335            1.0 - regularized_gamma_p(k + 1.0, lambda)
2336        }
2337    })
2338}
2339
2340/// Poisson quantile function (inverse CDF).
2341///
2342/// Uses bisection on the discrete CDF.
2343///
2344/// @param p probability vector
2345/// @param lambda mean rate parameter
2346/// @return numeric vector of quantiles
2347#[builtin(min_args = 2, namespace = "stats")]
2348fn builtin_qpois(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2349    let lambda = extract_param(args, named, "lambda", 1, f64::NAN);
2350    let lower_tail = extract_lower_tail(named);
2351    let log_p = extract_log_p(named);
2352    if lambda < 0.0 || lambda.is_nan() {
2353        return Err(RError::new(
2354            RErrorKind::Argument,
2355            "qpois(): 'lambda' must be non-negative".to_string(),
2356        ));
2357    }
2358    dist_vectorize_q(args, "qpois", lower_tail, log_p, |p| {
2359        if !(0.0..=1.0).contains(&p) {
2360            f64::NAN
2361        } else if p == 0.0 {
2362            0.0
2363        } else if p == 1.0 {
2364            f64::INFINITY
2365        } else if lambda == 0.0 {
2366            0.0
2367        } else {
2368            let hi = (lambda + 6.0 * lambda.sqrt() + 10.0).ceil() as i64;
2369            quantile_bisect_discrete(p, 0, hi, |k| {
2370                let kf = k as f64;
2371                1.0 - regularized_gamma_p(kf + 1.0, lambda)
2372            })
2373        }
2374    })
2375}
2376
2377// endregion
2378
2379// region: Geometric distribution (dgeom, pgeom, qgeom)
2380
2381/// Geometric density (probability mass) function.
2382///
2383/// dgeom(x, prob) = prob * (1-prob)^x (number of failures before first success)
2384///
2385/// @param x quantile vector (non-negative integers)
2386/// @param prob probability of success
2387/// @return numeric vector of probabilities
2388#[builtin(min_args = 2, namespace = "stats")]
2389fn builtin_dgeom(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2390    let prob = extract_param(args, named, "prob", 1, f64::NAN);
2391    let log_flag = extract_log_flag(named);
2392    if !(0.0..=1.0).contains(&prob) || prob.is_nan() {
2393        return Err(RError::new(
2394            RErrorKind::Argument,
2395            "dgeom(): 'prob' must be in (0, 1]".to_string(),
2396        ));
2397    }
2398    dist_vectorize_d(args, "dgeom", log_flag, |x| {
2399        let k = x.round();
2400        if (x - k).abs() > 1e-7 || k < 0.0 {
2401            0.0
2402        } else {
2403            prob * (1.0 - prob).powf(k)
2404        }
2405    })
2406}
2407
2408/// Geometric cumulative distribution function.
2409///
2410/// pgeom(q, prob) = 1 - (1-prob)^(floor(q)+1)
2411///
2412/// @param q quantile vector
2413/// @param prob probability of success
2414/// @return numeric vector of probabilities
2415#[builtin(min_args = 2, namespace = "stats")]
2416fn builtin_pgeom(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2417    let prob = extract_param(args, named, "prob", 1, f64::NAN);
2418    let lower_tail = extract_lower_tail(named);
2419    let log_p = extract_log_p(named);
2420    if !(0.0..=1.0).contains(&prob) || prob.is_nan() {
2421        return Err(RError::new(
2422            RErrorKind::Argument,
2423            "pgeom(): 'prob' must be in (0, 1]".to_string(),
2424        ));
2425    }
2426    dist_vectorize_p(args, "pgeom", lower_tail, log_p, |q| {
2427        if q < 0.0 {
2428            0.0
2429        } else {
2430            1.0 - (1.0 - prob).powf(q.floor() + 1.0)
2431        }
2432    })
2433}
2434
2435/// Geometric quantile function (inverse CDF).
2436///
2437/// qgeom(p, prob) = ceil(log(1-p) / log(1-prob)) - 1
2438///
2439/// @param p probability vector
2440/// @param prob probability of success
2441/// @return numeric vector of quantiles
2442#[builtin(min_args = 2, namespace = "stats")]
2443fn builtin_qgeom(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2444    let prob = extract_param(args, named, "prob", 1, f64::NAN);
2445    let lower_tail = extract_lower_tail(named);
2446    let log_p = extract_log_p(named);
2447    if !(0.0..=1.0).contains(&prob) || prob.is_nan() {
2448        return Err(RError::new(
2449            RErrorKind::Argument,
2450            "qgeom(): 'prob' must be in (0, 1]".to_string(),
2451        ));
2452    }
2453    dist_vectorize_q(args, "qgeom", lower_tail, log_p, |p| {
2454        if !(0.0..=1.0).contains(&p) {
2455            f64::NAN
2456        } else if p == 0.0 {
2457            0.0
2458        } else if p == 1.0 {
2459            f64::INFINITY
2460        } else if prob == 1.0 {
2461            0.0
2462        } else {
2463            ((1.0 - p).ln() / (1.0 - prob).ln() - 1.0).ceil().max(0.0)
2464        }
2465    })
2466}
2467
2468// endregion
2469
2470// region: Hypergeometric distribution (dhyper, phyper, qhyper)
2471
2472/// Hypergeometric density (probability mass) function.
2473///
2474/// dhyper(x, m, n, k) = choose(m,x) * choose(n,k-x) / choose(m+n,k)
2475///
2476/// @param x quantile vector (integer values)
2477/// @param m number of white balls in the urn
2478/// @param n number of black balls in the urn
2479/// @param k number of balls drawn from the urn
2480/// @return numeric vector of probabilities
2481#[builtin(min_args = 4, namespace = "stats")]
2482fn builtin_dhyper(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2483    let m = extract_param(args, named, "m", 1, f64::NAN);
2484    let n = extract_param(args, named, "n", 2, f64::NAN);
2485    let k = extract_param(args, named, "k", 3, f64::NAN);
2486    let log_flag = extract_log_flag(named);
2487    if m < 0.0 || n < 0.0 || k < 0.0 || m.is_nan() || n.is_nan() || k.is_nan() {
2488        return Err(RError::new(
2489            RErrorKind::Argument,
2490            "dhyper(): parameters must be non-negative".to_string(),
2491        ));
2492    }
2493    if k > m + n {
2494        return Err(RError::new(
2495            RErrorKind::Argument,
2496            "dhyper(): 'k' must not exceed 'm + n'".to_string(),
2497        ));
2498    }
2499    dist_vectorize_d(args, "dhyper", log_flag, |x| {
2500        let xi = x.round();
2501        if (x - xi).abs() > 1e-7 || xi < 0.0 || xi > m || xi > k || (k - xi) > n {
2502            0.0
2503        } else {
2504            (lchoose(m, xi) + lchoose(n, k - xi) - lchoose(m + n, k)).exp()
2505        }
2506    })
2507}
2508
2509/// Hypergeometric cumulative distribution function.
2510///
2511/// @param q quantile vector
2512/// @param m number of white balls in the urn
2513/// @param n number of black balls in the urn
2514/// @param k number of balls drawn from the urn
2515/// @return numeric vector of probabilities
2516#[builtin(min_args = 4, namespace = "stats")]
2517fn builtin_phyper(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2518    let m = extract_param(args, named, "m", 1, f64::NAN);
2519    let n = extract_param(args, named, "n", 2, f64::NAN);
2520    let k = extract_param(args, named, "k", 3, f64::NAN);
2521    let lower_tail = extract_lower_tail(named);
2522    let log_p = extract_log_p(named);
2523    if m < 0.0 || n < 0.0 || k < 0.0 || m.is_nan() || n.is_nan() || k.is_nan() {
2524        return Err(RError::new(
2525            RErrorKind::Argument,
2526            "phyper(): parameters must be non-negative".to_string(),
2527        ));
2528    }
2529    if k > m + n {
2530        return Err(RError::new(
2531            RErrorKind::Argument,
2532            "phyper(): 'k' must not exceed 'm + n'".to_string(),
2533        ));
2534    }
2535    let lo = (k - n).max(0.0) as i64;
2536    dist_vectorize_p(args, "phyper", lower_tail, log_p, |q| {
2537        if q < lo as f64 {
2538            0.0
2539        } else if q >= m.min(k) {
2540            1.0
2541        } else {
2542            let qi = q.floor() as i64;
2543            let mut sum = 0.0;
2544            for j in lo..=qi {
2545                let jf = j as f64;
2546                sum += (lchoose(m, jf) + lchoose(n, k - jf) - lchoose(m + n, k)).exp();
2547            }
2548            sum.min(1.0)
2549        }
2550    })
2551}
2552
2553/// Hypergeometric quantile function (inverse CDF).
2554///
2555/// @param p probability vector
2556/// @param m number of white balls in the urn
2557/// @param n number of black balls in the urn
2558/// @param k number of balls drawn from the urn
2559/// @return numeric vector of quantiles
2560#[builtin(min_args = 4, namespace = "stats")]
2561fn builtin_qhyper(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2562    let m = extract_param(args, named, "m", 1, f64::NAN);
2563    let n = extract_param(args, named, "n", 2, f64::NAN);
2564    let k = extract_param(args, named, "k", 3, f64::NAN);
2565    let lower_tail = extract_lower_tail(named);
2566    let log_p = extract_log_p(named);
2567    if m < 0.0 || n < 0.0 || k < 0.0 || m.is_nan() || n.is_nan() || k.is_nan() {
2568        return Err(RError::new(
2569            RErrorKind::Argument,
2570            "qhyper(): parameters must be non-negative".to_string(),
2571        ));
2572    }
2573    if k > m + n {
2574        return Err(RError::new(
2575            RErrorKind::Argument,
2576            "qhyper(): 'k' must not exceed 'm + n'".to_string(),
2577        ));
2578    }
2579    let lo = (k - n).max(0.0) as i64;
2580    let hi = m.min(k) as i64;
2581    dist_vectorize_q(args, "qhyper", lower_tail, log_p, |p| {
2582        if !(0.0..=1.0).contains(&p) {
2583            f64::NAN
2584        } else if p == 0.0 {
2585            lo as f64
2586        } else if p == 1.0 {
2587            hi as f64
2588        } else {
2589            let mut cum = 0.0;
2590            for j in lo..=hi {
2591                let jf = j as f64;
2592                cum += (lchoose(m, jf) + lchoose(n, k - jf) - lchoose(m + n, k)).exp();
2593                if cum >= p {
2594                    return jf;
2595                }
2596            }
2597            hi as f64
2598        }
2599    })
2600}
2601
2602// endregion