Skip to main content

r/interpreter/builtins/
strings.rs

1use std::collections::HashMap;
2
3use aho_corasick::AhoCorasick;
4use bstr::ByteSlice;
5use memchr::memmem;
6use unicode_width::UnicodeWidthStr;
7
8use super::CallArgs;
9use crate::interpreter::value::*;
10use crate::interpreter::BuiltinContext;
11use derive_more::{Display, Error};
12use minir_macros::{builtin, interpreter_builtin};
13use regex::Regex;
14
15use crate::interpreter::value::deparse_expr;
16
17// region: StringError
18
/// Structured error type for string operations.
///
/// Values of this type are turned into an `RError` of kind
/// `RErrorKind::Argument` through the `From<StringError> for RError` impl
/// in this file.
#[derive(Debug, Display, Error)]
pub enum StringError {
    /// A pattern was rejected by the `regex` crate. `source` holds the
    /// compile error and is interpolated into the display message.
    #[display("invalid regular expression: {}", source)]
    InvalidRegex {
        #[error(source)]
        source: regex::Error,
    },
}
28
29impl From<StringError> for RError {
30    fn from(e: StringError) -> Self {
31        RError::from_source(RErrorKind::Argument, e)
32    }
33}
34
35// endregion
36
37/// Extract common regex options from named args: fixed, ignore.case, perl
38fn get_regex_opts(named: &[(String, RValue)]) -> (bool, bool) {
39    let fixed = named
40        .iter()
41        .find(|(n, _)| n == "fixed")
42        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
43        .unwrap_or(false);
44    let ignore_case = named
45        .iter()
46        .find(|(n, _)| n == "ignore.case")
47        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
48        .unwrap_or(false);
49    (fixed, ignore_case)
50}
51
/// Translate R/PCRE regex patterns to Rust `regex` crate syntax.
///
/// The two dialects agree on most constructs, so the pattern is copied through
/// verbatim except for the one known incompatibility handled here:
///
/// - A bare `[` inside a bracket expression is a literal in PCRE/POSIX but
///   opens a *nested* class in the `regex` crate, so it is emitted as `\[`.
///   POSIX classes such as `[:alpha:]` / `[:^digit:]` are recognised and left
///   untouched.
///
/// Escape sequences (`\x`) are always copied as an atomic pair, both inside and
/// outside bracket expressions. In particular `\]` inside a class stays `\]`:
/// the `regex` crate accepts it, and dropping the backslash would close the
/// class early (`[a\]b]` must not become `[a]b]`). Likewise `\[` outside a
/// class is a literal bracket and must not be mistaken for a class opener.
///
/// Note: because a bare `[` inside a class is escaped, Rust-only nested-class
/// syntax (e.g. `[a-z&&[^aeiou]]`) is intentionally not preserved; input is
/// assumed to be written in R's dialect.
fn translate_pcre_to_rust(pattern: &str) -> String {
    let chars: Vec<char> = pattern.chars().collect();
    let mut result = String::with_capacity(pattern.len());
    let mut in_class = false;
    let mut i = 0;
    while i < chars.len() {
        match chars[i] {
            '\\' => {
                // Copy the escape together with the character it protects so the
                // protected character is never interpreted structurally below.
                result.push('\\');
                if let Some(&escaped) = chars.get(i + 1) {
                    result.push(escaped);
                    i += 1;
                }
            }
            '[' if !in_class => {
                in_class = true;
                result.push('[');
                // A leading `^` negates the class, and a `]` directly after the
                // opener (or after `^`) is a literal, not the terminator.
                if chars.get(i + 1) == Some(&'^') {
                    result.push('^');
                    i += 1;
                }
                if chars.get(i + 1) == Some(&']') {
                    result.push(']');
                    i += 1;
                }
            }
            '[' => {
                // Inside a class: either a POSIX class `[:name:]` / `[:^name:]`
                // (copied whole, so its `]` does not end the outer class) or a
                // literal bracket that must be escaped for the regex crate.
                let mut j = i + 1;
                let mut is_posix = false;
                if chars.get(j) == Some(&':') {
                    j += 1;
                    if chars.get(j) == Some(&'^') {
                        j += 1;
                    }
                    let name_start = j;
                    while matches!(chars.get(j), Some(ch) if ch.is_ascii_alphabetic()) {
                        j += 1;
                    }
                    is_posix = j > name_start
                        && chars.get(j) == Some(&':')
                        && chars.get(j + 1) == Some(&']');
                }
                if is_posix {
                    // chars[i..=j + 1] spans "[:name:]".
                    result.extend(chars[i..=j + 1].iter());
                    i = j + 1;
                } else {
                    result.push_str("\\[");
                }
            }
            ']' if in_class => {
                in_class = false;
                result.push(']');
            }
            other => result.push(other),
        }
        i += 1;
    }
    result
}
105
106fn build_regex(pattern: &str, fixed: bool, ignore_case: bool) -> Result<Regex, RError> {
107    let pat = if fixed {
108        regex::escape(pattern)
109    } else {
110        translate_pcre_to_rust(pattern)
111    };
112    let pat = if ignore_case {
113        format!("(?i){}", pat)
114    } else {
115        pat
116    };
117    Regex::new(&pat).map_err(|source| -> RError { StringError::InvalidRegex { source }.into() })
118}
119
/// Convert an R-style replacement string into the `regex` crate's replacement syntax.
///
/// - `\N` (a backslash followed by one ASCII digit) becomes `${N}`. The braces
///   matter: the crate reads `$1abc` as a reference to a group *named* `1abc`,
///   so an unbraced form silently expands to the empty string whenever the
///   backreference is followed by a letter, digit or underscore.
/// - `\\` becomes a single literal backslash, so an escaped backslash followed
///   by a digit (`\\1`) is not mistaken for a backreference.
/// - `$` has no special meaning in R replacements but does in the crate, so it
///   is emitted as `$$` (a literal dollar sign).
/// - Any other character, including a backslash before a non-digit or at the
///   very end, is copied unchanged.
fn convert_replacement(repl: &str) -> String {
    let mut result = String::with_capacity(repl.len() + 8);
    let mut chars = repl.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '\\' => match chars.peek().copied() {
                Some(digit) if digit.is_ascii_digit() => {
                    chars.next();
                    result.push_str("${");
                    result.push(digit);
                    result.push('}');
                }
                Some('\\') => {
                    chars.next();
                    result.push('\\');
                }
                _ => result.push('\\'),
            },
            '$' => result.push_str("$$"),
            other => result.push(other),
        }
    }
    result
}
137
138// region: memchr fixed-pattern helpers
139
140/// Build an `AhoCorasick` automaton for a single fixed pattern, optionally case-insensitive.
141/// This builds the automaton once and amortizes the cost across many haystack searches.
142fn build_fixed_searcher(pattern: &str, ignore_case: bool) -> Result<AhoCorasick, RError> {
143    let mut builder = aho_corasick::AhoCorasickBuilder::new();
144    builder
145        .ascii_case_insensitive(ignore_case)
146        .kind(Some(aho_corasick::AhoCorasickKind::DFA));
147    builder.build([pattern]).map_err(|e| {
148        RError::new(
149            RErrorKind::Argument,
150            format!("failed to build fixed-pattern searcher: {e}"),
151        )
152    })
153}
154
155/// Replace all occurrences of `needle` in `haystack` with `replacement`.
156fn fixed_gsub(haystack: &str, needle: &str, replacement: &str) -> String {
157    if needle.is_empty() {
158        // Match R behavior for empty pattern: insert replacement before each char and after the last
159        let mut result =
160            String::with_capacity(haystack.len() + replacement.len() * (haystack.len() + 1));
161        for ch in haystack.chars() {
162            result.push_str(replacement);
163            result.push(ch);
164        }
165        result.push_str(replacement);
166        return result;
167    }
168    let finder = memmem::Finder::new(needle.as_bytes());
169    let mut result = String::with_capacity(haystack.len());
170    let mut last_end = 0;
171    for pos in finder.find_iter(haystack.as_bytes()) {
172        result.push_str(&haystack[last_end..pos]);
173        result.push_str(replacement);
174        last_end = pos + needle.len();
175    }
176    result.push_str(&haystack[last_end..]);
177    result
178}
179
180/// Split `haystack` on all occurrences of `needle`.
181fn fixed_split(haystack: &str, needle: &str) -> Vec<Option<String>> {
182    if needle.is_empty() {
183        // Empty split: split into individual characters (handled by caller)
184        return haystack.chars().map(|c| Some(c.to_string())).collect();
185    }
186    let finder = memmem::Finder::new(needle.as_bytes());
187    let mut parts = Vec::new();
188    let mut last_end = 0;
189    for pos in finder.find_iter(haystack.as_bytes()) {
190        parts.push(Some(haystack[last_end..pos].to_string()));
191        last_end = pos + needle.len();
192    }
193    parts.push(Some(haystack[last_end..].to_string()));
194    parts
195}
196
197// endregion
198
199// region: Approximate (fuzzy) matching helpers
200
/// Levenshtein edit distance between `a` and `b`, counted in `char`s.
///
/// Insertions, deletions and substitutions each cost 1. The table is filled
/// row by row keeping only the previous and the current row, and the shorter
/// input is used for the row width, so extra space is O(min(m, n)).
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let left: Vec<char> = a.chars().collect();
    let right: Vec<char> = b.chars().collect();

    // Distance to/from the empty string is just the other string's length.
    if left.is_empty() {
        return right.len();
    }
    if right.is_empty() {
        return left.len();
    }

    // `outer` drives the rows, `inner` (the shorter one) the columns.
    let (outer, inner) = if left.len() <= right.len() {
        (&right, &left)
    } else {
        (&left, &right)
    };
    let width = inner.len();

    let mut previous: Vec<usize> = (0..=width).collect();
    let mut current: Vec<usize> = vec![0; width + 1];

    for (row, &outer_ch) in outer.iter().enumerate() {
        current[0] = row + 1;
        for (col, &inner_ch) in inner.iter().enumerate() {
            let substitution = previous[col] + usize::from(outer_ch != inner_ch);
            let deletion = previous[col + 1] + 1;
            let insertion = current[col] + 1;
            current[col + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut previous, &mut current);
    }

    // After the final swap the last computed row lives in `previous`.
    previous[width]
}
245
/// Check whether some substring of `haystack` is within `max_dist` Levenshtein
/// edits (counted in `char`s) of `needle`.
///
/// This is the classic approximate-substring dynamic programme: it is the
/// Levenshtein table for `needle` against `haystack`, except that a match may
/// start at any haystack position for free (the "zero prefix" row stays 0).
/// After each haystack character, `column[i]` holds the smallest edit distance
/// between `needle[..i]` and any substring of `haystack` ending at that
/// character; a match exists as soon as `column[needle_len] <= max_dist`.
///
/// One pass over `haystack` with O(needle length) extra space. No arithmetic
/// involves `max_dist`, so arbitrarily large values (e.g. `usize::MAX`) are safe.
///
/// An empty `needle` always matches. The empty substring also counts, so any
/// haystack matches when `max_dist >= needle length`.
fn approximate_contains(haystack: &str, needle: &str, max_dist: usize) -> bool {
    let needle_chars: Vec<char> = needle.chars().collect();
    let n_len = needle_chars.len();

    if n_len == 0 {
        return true;
    }

    // Before consuming any haystack character only the empty substring is
    // available, which costs `i` deletions against `needle[..i]`.
    let mut column: Vec<usize> = (0..=n_len).collect();
    if column[n_len] <= max_dist {
        return true;
    }

    for hay_ch in haystack.chars() {
        // `diagonal` is the previous column's value one row up; row 0 is always 0
        // because a match may start at any haystack position.
        let mut diagonal = column[0];
        for i in 1..=n_len {
            let left = column[i]; // previous column, same row
            let cost = usize::from(needle_chars[i - 1] != hay_ch);
            column[i] = (diagonal + cost).min(left + 1).min(column[i - 1] + 1);
            diagonal = left;
        }
        if column[n_len] <= max_dist {
            return true;
        }
    }

    false
}
278
279/// Parse R's `max.distance` argument for agrep/agrepl.
280/// R accepts either a single numeric (fraction of pattern length if < 1, absolute if >= 1)
281/// or a named list. We support the simple numeric case.
282fn parse_max_distance(named: &[(String, RValue)], pattern_len: usize) -> usize {
283    let dist = named
284        .iter()
285        .find(|(n, _)| n == "max.distance")
286        .and_then(|(_, v)| v.as_vector()?.as_double_scalar());
287
288    match dist {
289        Some(d) if d < 1.0 => {
290            // Fraction of pattern length (R default is 0.1)
291            let computed = (d * pattern_len as f64).floor() as usize;
292            computed.max(0)
293        }
294        Some(d) => {
295            // Absolute distance
296            d.floor() as usize
297        }
298        None => {
299            // R default: max.distance = 0.1 (10% of pattern length)
300            let computed = (0.1 * pattern_len as f64).floor() as usize;
301            computed.max(0)
302        }
303    }
304}
305
306// endregion
307
308/// Extract substrings from character strings.
309///
310/// Vectorized over x, start, and stop with recycling.
311///
312/// @param x character vector to extract from
313/// @param start integer vector of starting positions (1-indexed)
314/// @param stop integer vector of ending positions (inclusive)
315/// @return character vector containing the substrings
316#[builtin(min_args = 3, names = ["substring"])]
317fn builtin_substr(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
318    let x_vec = args
319        .first()
320        .and_then(|v| v.as_vector())
321        .map(|v| v.to_characters())
322        .unwrap_or_default();
323    let start_vec = args
324        .get(1)
325        .and_then(|v| v.as_vector())
326        .map(|v| v.to_integers())
327        .unwrap_or_else(|| vec![Some(1)]);
328    let stop_vec = args
329        .get(2)
330        .and_then(|v| v.as_vector())
331        .map(|v| v.to_integers())
332        .unwrap_or_default();
333
334    if x_vec.is_empty() {
335        return Ok(RValue::vec(Vector::Character(vec![].into())));
336    }
337
338    let n = x_vec.len().max(start_vec.len()).max(stop_vec.len());
339    let result: Vec<Option<String>> = (0..n)
340        .map(|i| {
341            let s_opt = &x_vec[i % x_vec.len()];
342            let start_opt = start_vec[i % start_vec.len()];
343            let stop_opt = stop_vec[i % stop_vec.len()];
344            match (s_opt, start_opt, stop_opt) {
345                (Some(s), Some(start), Some(stop)) => {
346                    let start = usize::try_from(start).unwrap_or(0);
347                    let stop = usize::try_from(stop).unwrap_or(0);
348                    let start = start.saturating_sub(1); // R is 1-indexed
349                    if start < s.len() {
350                        Some(s[start..stop.min(s.len())].to_string())
351                    } else {
352                        Some(String::new())
353                    }
354                }
355                (None, _, _) | (_, None, _) | (_, _, None) => None,
356            }
357        })
358        .collect();
359    Ok(RValue::vec(Vector::Character(result.into())))
360}
361
362/// Convert strings to upper case.
363///
364/// @param x character vector to convert
365/// @return character vector with all characters in upper case
366#[builtin(min_args = 1)]
367fn builtin_toupper(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
368    let vals = match args.first() {
369        Some(RValue::Vector(rv)) => rv.to_characters(),
370        Some(RValue::Null) => return Ok(RValue::vec(Vector::Character(vec![].into()))),
371        _ => {
372            return Err(RError::new(
373                RErrorKind::Argument,
374                "argument is not character".to_string(),
375            ))
376        }
377    };
378    let result: Vec<Option<String>> = vals
379        .iter()
380        .map(|s| s.as_ref().map(|s| s.to_uppercase()))
381        .collect();
382    Ok(RValue::vec(Vector::Character(result.into())))
383}
384
385/// Convert strings to lower case.
386///
387/// @param x character vector to convert
388/// @return character vector with all characters in lower case
389#[builtin(min_args = 1)]
390fn builtin_tolower(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
391    let vals = match args.first() {
392        Some(RValue::Vector(rv)) => rv.to_characters(),
393        Some(RValue::Null) => return Ok(RValue::vec(Vector::Character(vec![].into()))),
394        _ => {
395            return Err(RError::new(
396                RErrorKind::Argument,
397                "argument is not character".to_string(),
398            ))
399        }
400    };
401    let result: Vec<Option<String>> = vals
402        .iter()
403        .map(|s| s.as_ref().map(|s| s.to_lowercase()))
404        .collect();
405    Ok(RValue::vec(Vector::Character(result.into())))
406}
407
408/// Remove leading and/or trailing whitespace from strings.
409///
410/// @param x character vector to trim
411/// @param which character scalar: "both", "left", or "right" (default "both")
412/// @param whitespace character scalar: regex pattern of whitespace characters
413///   to trim (default `"[ \\t\\r\\n]"`)
414/// @return character vector with whitespace removed
415#[builtin(min_args = 1)]
416fn builtin_trimws(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
417    let which = named
418        .iter()
419        .find(|(n, _)| n == "which")
420        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
421        .or_else(|| {
422            args.get(1)
423                .and_then(|v| v.as_vector()?.as_character_scalar())
424        })
425        .unwrap_or_else(|| "both".to_string());
426
427    let whitespace_pat = named
428        .iter()
429        .find(|(n, _)| n == "whitespace")
430        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
431        .or_else(|| {
432            args.get(2)
433                .and_then(|v| v.as_vector()?.as_character_scalar())
434        })
435        .unwrap_or_else(|| "[ \\t\\r\\n]".to_string());
436
437    // If using default whitespace, use fast built-in trim; otherwise use regex
438    let use_regex = whitespace_pat != "[ \\t\\r\\n]";
439
440    if use_regex {
441        // Build regex anchored patterns for leading/trailing whitespace
442        let left_re = Regex::new(&format!("^({whitespace_pat})+")).map_err(|e| {
443            RError::new(
444                RErrorKind::Argument,
445                format!("invalid 'whitespace' regex pattern: {e}"),
446            )
447        })?;
448        let right_re = Regex::new(&format!("({whitespace_pat})+$")).map_err(|e| {
449            RError::new(
450                RErrorKind::Argument,
451                format!("invalid 'whitespace' regex pattern: {e}"),
452            )
453        })?;
454
455        match args.first() {
456            Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
457                let Vector::Character(vals) = &rv.inner else {
458                    unreachable!()
459                };
460                let result: Vec<Option<String>> = vals
461                    .iter()
462                    .map(|s| {
463                        s.as_ref().map(|s| {
464                            let s = match which.as_str() {
465                                "both" | "left" => left_re.replace(s, "").into_owned(),
466                                _ => s.to_string(),
467                            };
468                            match which.as_str() {
469                                "both" | "right" => right_re.replace(&s, "").into_owned(),
470                                _ => s,
471                            }
472                        })
473                    })
474                    .collect();
475                Ok(RValue::vec(Vector::Character(result.into())))
476            }
477            _ => Err(RError::new(
478                RErrorKind::Argument,
479                "argument is not character".to_string(),
480            )),
481        }
482    } else {
483        let trim_fn: fn(&str) -> &str = match which.as_str() {
484            "both" => str::trim,
485            "left" => str::trim_start,
486            "right" => str::trim_end,
487            _ => {
488                return Err(RError::new(
489                    RErrorKind::Argument,
490                    format!(
491                        "invalid 'which' argument: {:?} — must be \"both\", \"left\", or \"right\"",
492                        which
493                    ),
494                ))
495            }
496        };
497        match args.first() {
498            Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
499                let Vector::Character(vals) = &rv.inner else {
500                    unreachable!()
501                };
502                let result: Vec<Option<String>> = vals
503                    .iter()
504                    .map(|s| s.as_ref().map(|s| trim_fn(s).to_string()))
505                    .collect();
506                Ok(RValue::vec(Vector::Character(result.into())))
507            }
508            _ => Err(RError::new(
509                RErrorKind::Argument,
510                "argument is not character".to_string(),
511            )),
512        }
513    }
514}
515
516/// Replace all occurrences of a pattern in character strings.
517///
518/// @param pattern character scalar: regular expression or fixed string
519/// @param replacement character scalar: replacement text
520/// @param x character vector to search
521/// @param fixed logical: if TRUE, pattern is a literal string
522/// @param ignore.case logical: if TRUE, matching is case-insensitive
523/// @return character vector with all matches replaced
524#[builtin(min_args = 3)]
525fn builtin_gsub(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
526    // NULL input → character(0)
527    if matches!(args.get(2), Some(RValue::Null) | None) {
528        return Ok(RValue::vec(Vector::Character(vec![].into())));
529    }
530    let pattern = args
531        .first()
532        .and_then(|v| v.as_vector()?.as_character_scalar())
533        .unwrap_or_default();
534    let replacement = args
535        .get(1)
536        .and_then(|v| v.as_vector()?.as_character_scalar())
537        .unwrap_or_default();
538    let (fixed, ignore_case) = get_regex_opts(named);
539
540    // Fast path: fixed=TRUE uses AhoCorasick (handles ignore.case too)
541    if fixed {
542        // Empty-pattern gsub has special R semantics — keep using the dedicated helper
543        if pattern.is_empty() && !ignore_case {
544            return match args.get(2) {
545                Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
546                    let Vector::Character(vals) = &rv.inner else {
547                        unreachable!()
548                    };
549                    let result: Vec<Option<String>> = vals
550                        .iter()
551                        .map(|s| s.as_ref().map(|s| fixed_gsub(s, &pattern, &replacement)))
552                        .collect();
553                    Ok(RValue::vec(Vector::Character(result.into())))
554                }
555                _ => Err(RError::new(
556                    RErrorKind::Argument,
557                    "argument is not character".to_string(),
558                )),
559            };
560        }
561        let ac = build_fixed_searcher(&pattern, ignore_case)?;
562        return match args.get(2) {
563            Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
564                let Vector::Character(vals) = &rv.inner else {
565                    unreachable!()
566                };
567                let result: Vec<Option<String>> = vals
568                    .iter()
569                    .map(|s| {
570                        s.as_ref()
571                            .map(|s| ac.replace_all(s, &[replacement.as_str()]))
572                    })
573                    .collect();
574                Ok(RValue::vec(Vector::Character(result.into())))
575            }
576            _ => Err(RError::new(
577                RErrorKind::Argument,
578                "argument is not character".to_string(),
579            )),
580        };
581    }
582
583    let re = build_regex(&pattern, fixed, ignore_case)?;
584    let repl = convert_replacement(&replacement);
585    match args.get(2) {
586        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
587            let Vector::Character(vals) = &rv.inner else {
588                unreachable!()
589            };
590            let result: Vec<Option<String>> = vals
591                .iter()
592                .map(|s| {
593                    s.as_ref()
594                        .map(|s| re.replace_all(s, repl.as_str()).into_owned())
595                })
596                .collect();
597            Ok(RValue::vec(Vector::Character(result.into())))
598        }
599        _ => Err(RError::new(
600            RErrorKind::Argument,
601            "argument is not character".to_string(),
602        )),
603    }
604}
605
606/// Replace the first occurrence of a pattern in character strings.
607///
608/// @param pattern character scalar: regular expression or fixed string
609/// @param replacement character scalar: replacement text
610/// @param x character vector to search
611/// @param fixed logical: if TRUE, pattern is a literal string
612/// @param ignore.case logical: if TRUE, matching is case-insensitive
613/// @return character vector with first match replaced
614#[builtin(min_args = 3)]
615fn builtin_sub(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
616    // NULL input → character(0)
617    if matches!(args.get(2), Some(RValue::Null) | None) {
618        return Ok(RValue::vec(Vector::Character(vec![].into())));
619    }
620    let pattern = args
621        .first()
622        .and_then(|v| v.as_vector()?.as_character_scalar())
623        .unwrap_or_default();
624    let replacement = args
625        .get(1)
626        .and_then(|v| v.as_vector()?.as_character_scalar())
627        .unwrap_or_default();
628    let (fixed, ignore_case) = get_regex_opts(named);
629
630    // Fast path: fixed=TRUE uses AhoCorasick (handles ignore.case too)
631    if fixed {
632        let ac = build_fixed_searcher(&pattern, ignore_case)?;
633        return match args.get(2) {
634            Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
635                let Vector::Character(vals) = &rv.inner else {
636                    unreachable!()
637                };
638                let result: Vec<Option<String>> = vals
639                    .iter()
640                    .map(|s| {
641                        s.as_ref().map(|s| {
642                            // Replace only the first match
643                            match ac.find(s) {
644                                Some(m) => {
645                                    let mut out = String::with_capacity(
646                                        s.len() - m.len() + replacement.len(),
647                                    );
648                                    out.push_str(&s[..m.start()]);
649                                    out.push_str(&replacement);
650                                    out.push_str(&s[m.end()..]);
651                                    out
652                                }
653                                None => s.to_string(),
654                            }
655                        })
656                    })
657                    .collect();
658                Ok(RValue::vec(Vector::Character(result.into())))
659            }
660            _ => Err(RError::new(
661                RErrorKind::Argument,
662                "argument is not character".to_string(),
663            )),
664        };
665    }
666
667    let re = build_regex(&pattern, fixed, ignore_case)?;
668    let repl = convert_replacement(&replacement);
669    match args.get(2) {
670        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
671            let Vector::Character(vals) = &rv.inner else {
672                unreachable!()
673            };
674            let result: Vec<Option<String>> = vals
675                .iter()
676                .map(|s| {
677                    s.as_ref()
678                        .map(|s| re.replace(s, repl.as_str()).into_owned())
679                })
680                .collect();
681            Ok(RValue::vec(Vector::Character(result.into())))
682        }
683        _ => Err(RError::new(
684            RErrorKind::Argument,
685            "argument is not character".to_string(),
686        )),
687    }
688}
689
690/// Test whether a pattern matches each element of a character vector.
691///
692/// @param pattern character scalar: regular expression or fixed string
693/// @param x character vector to search
694/// @param fixed logical: if TRUE, pattern is a literal string
695/// @param ignore.case logical: if TRUE, matching is case-insensitive
696/// @return logical vector indicating which elements match
697#[builtin(min_args = 2)]
698fn builtin_grepl(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
699    if matches!(args.get(1), Some(RValue::Null) | None) {
700        return Ok(RValue::vec(Vector::Logical(vec![].into())));
701    }
702    let pattern = args
703        .first()
704        .and_then(|v| v.as_vector()?.as_character_scalar())
705        .unwrap_or_default();
706    let (fixed, ignore_case) = get_regex_opts(named);
707
708    // Fast path: fixed=TRUE uses AhoCorasick automaton (built once, amortized across vector)
709    if fixed {
710        let ac = build_fixed_searcher(&pattern, ignore_case)?;
711        return match args.get(1) {
712            Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
713                let Vector::Character(vals) = &rv.inner else {
714                    unreachable!()
715                };
716                let result: Vec<Option<bool>> = vals
717                    .iter()
718                    .map(|s| s.as_ref().map(|s| ac.is_match(s)))
719                    .collect();
720                Ok(RValue::vec(Vector::Logical(result.into())))
721            }
722            _ => Err(RError::new(
723                RErrorKind::Argument,
724                "argument is not character".to_string(),
725            )),
726        };
727    }
728
729    let re = build_regex(&pattern, fixed, ignore_case)?;
730    match args.get(1) {
731        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
732            let Vector::Character(vals) = &rv.inner else {
733                unreachable!()
734            };
735            let result: Vec<Option<bool>> = vals
736                .iter()
737                .map(|s| s.as_ref().map(|s| re.is_match(s)))
738                .collect();
739            Ok(RValue::vec(Vector::Logical(result.into())))
740        }
741        _ => Err(RError::new(
742            RErrorKind::Argument,
743            "argument is not character".to_string(),
744        )),
745    }
746}
747
748/// Search for pattern matches in a character vector.
749///
750/// @param pattern character scalar: regular expression or fixed string
751/// @param x character vector to search
752/// @param value logical: if TRUE, return matching elements instead of indices
753/// @param fixed logical: if TRUE, pattern is a literal string
754/// @param ignore.case logical: if TRUE, matching is case-insensitive
755/// @return integer vector of indices (default) or character vector of matching elements
756#[builtin(min_args = 2)]
757fn builtin_grep(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
758    if matches!(args.get(1), Some(RValue::Null) | None) {
759        return Ok(RValue::vec(Vector::Integer(vec![].into())));
760    }
761    let pattern = args
762        .first()
763        .and_then(|v| v.as_vector()?.as_character_scalar())
764        .unwrap_or_default();
765    let value = named
766        .iter()
767        .find(|(n, _)| n == "value")
768        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
769        .unwrap_or(false);
770    let (fixed, ignore_case) = get_regex_opts(named);
771
772    // Fast path: fixed=TRUE uses AhoCorasick automaton (built once, amortized across vector)
773    if fixed {
774        let ac = build_fixed_searcher(&pattern, ignore_case)?;
775        return match args.get(1) {
776            Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
777                let Vector::Character(vals) = &rv.inner else {
778                    unreachable!()
779                };
780                if value {
781                    let result: Vec<Option<String>> = vals
782                        .iter()
783                        .filter(|s| s.as_ref().map(|s| ac.is_match(s)).unwrap_or(false))
784                        .cloned()
785                        .collect();
786                    Ok(RValue::vec(Vector::Character(result.into())))
787                } else {
788                    let result: Result<Vec<Option<i64>>, RError> = vals
789                        .iter()
790                        .enumerate()
791                        .filter(|(_, s)| s.as_ref().map(|s| ac.is_match(s)).unwrap_or(false))
792                        .map(|(i, _)| Ok(Some(i64::try_from(i)? + 1)))
793                        .collect();
794                    Ok(RValue::vec(Vector::Integer(result?.into())))
795                }
796            }
797            _ => Err(RError::new(
798                RErrorKind::Argument,
799                "argument is not character".to_string(),
800            )),
801        };
802    }
803
804    let re = build_regex(&pattern, fixed, ignore_case)?;
805
806    match args.get(1) {
807        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
808            let Vector::Character(vals) = &rv.inner else {
809                unreachable!()
810            };
811            if value {
812                let result: Vec<Option<String>> = vals
813                    .iter()
814                    .filter(|s| s.as_ref().map(|s| re.is_match(s)).unwrap_or(false))
815                    .cloned()
816                    .collect();
817                Ok(RValue::vec(Vector::Character(result.into())))
818            } else {
819                let result: Result<Vec<Option<i64>>, RError> = vals
820                    .iter()
821                    .enumerate()
822                    .filter(|(_, s)| s.as_ref().map(|s| re.is_match(s)).unwrap_or(false))
823                    .map(|(i, _)| Ok(Some(i64::try_from(i)? + 1)))
824                    .collect();
825                Ok(RValue::vec(Vector::Integer(result?.into())))
826            }
827        }
828        _ => Err(RError::new(
829            RErrorKind::Argument,
830            "argument is not character".to_string(),
831        )),
832    }
833}
834
835/// Approximate grep: search for approximate pattern matches in a character vector.
836///
837/// Uses Levenshtein edit distance to find elements that approximately match the pattern.
838/// By default, the maximum distance is 10% of the pattern length (R's default for max.distance = 0.1).
839///
840/// @param pattern character scalar: pattern to approximately match
841/// @param x character vector to search
842/// @param max.distance numeric: maximum edit distance (< 1 means fraction of pattern length)
843/// @param value logical: if TRUE, return matching elements instead of indices
844/// @param ignore.case logical: if TRUE, matching is case-insensitive
845/// @return integer vector of indices (default) or character vector of matching elements
846#[builtin(name = "agrep", min_args = 2)]
847fn builtin_agrep(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
848    let pattern = args
849        .first()
850        .and_then(|v| v.as_vector()?.as_character_scalar())
851        .unwrap_or_default();
852    let ignore_case = named
853        .iter()
854        .find(|(n, _)| n == "ignore.case")
855        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
856        .unwrap_or(false);
857    let value = named
858        .iter()
859        .find(|(n, _)| n == "value")
860        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
861        .unwrap_or(false);
862    let max_dist = parse_max_distance(named, pattern.chars().count());
863
864    let match_pattern = if ignore_case {
865        pattern.to_lowercase()
866    } else {
867        pattern.clone()
868    };
869
870    match args.get(1) {
871        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
872            let Vector::Character(vals) = &rv.inner else {
873                unreachable!()
874            };
875            if value {
876                let result: Vec<Option<String>> = vals
877                    .iter()
878                    .filter(|s| {
879                        s.as_ref()
880                            .map(|s| {
881                                let hay = if ignore_case {
882                                    s.to_lowercase()
883                                } else {
884                                    s.to_string()
885                                };
886                                approximate_contains(&hay, &match_pattern, max_dist)
887                            })
888                            .unwrap_or(false)
889                    })
890                    .cloned()
891                    .collect();
892                Ok(RValue::vec(Vector::Character(result.into())))
893            } else {
894                let result: Result<Vec<Option<i64>>, RError> = vals
895                    .iter()
896                    .enumerate()
897                    .filter(|(_, s)| {
898                        s.as_ref()
899                            .map(|s| {
900                                let hay = if ignore_case {
901                                    s.to_lowercase()
902                                } else {
903                                    s.to_string()
904                                };
905                                approximate_contains(&hay, &match_pattern, max_dist)
906                            })
907                            .unwrap_or(false)
908                    })
909                    .map(|(i, _)| Ok(Some(i64::try_from(i)? + 1)))
910                    .collect();
911                Ok(RValue::vec(Vector::Integer(result?.into())))
912            }
913        }
914        _ => Err(RError::new(
915            RErrorKind::Argument,
916            "argument is not character".to_string(),
917        )),
918    }
919}
920
921/// Approximate grepl: test whether a pattern approximately matches each element.
922///
923/// Uses Levenshtein edit distance for fuzzy matching. Returns a logical vector.
924///
925/// @param pattern character scalar: pattern to approximately match
926/// @param x character vector to search
927/// @param max.distance numeric: maximum edit distance (< 1 means fraction of pattern length)
928/// @param ignore.case logical: if TRUE, matching is case-insensitive
929/// @return logical vector indicating which elements approximately match
930#[builtin(name = "agrepl", min_args = 2)]
931fn builtin_agrepl(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
932    let pattern = args
933        .first()
934        .and_then(|v| v.as_vector()?.as_character_scalar())
935        .unwrap_or_default();
936    let ignore_case = named
937        .iter()
938        .find(|(n, _)| n == "ignore.case")
939        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
940        .unwrap_or(false);
941    let max_dist = parse_max_distance(named, pattern.chars().count());
942
943    let match_pattern = if ignore_case {
944        pattern.to_lowercase()
945    } else {
946        pattern.clone()
947    };
948
949    match args.get(1) {
950        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
951            let Vector::Character(vals) = &rv.inner else {
952                unreachable!()
953            };
954            let result: Vec<Option<bool>> = vals
955                .iter()
956                .map(|s| {
957                    s.as_ref().map(|s| {
958                        let hay = if ignore_case {
959                            s.to_lowercase()
960                        } else {
961                            s.to_string()
962                        };
963                        approximate_contains(&hay, &match_pattern, max_dist)
964                    })
965                })
966                .collect();
967            Ok(RValue::vec(Vector::Logical(result.into())))
968        }
969        _ => Err(RError::new(
970            RErrorKind::Argument,
971            "argument is not character".to_string(),
972        )),
973    }
974}
975
976/// Find the first match of a pattern in each element of a character vector.
977///
978/// @param pattern character scalar: regular expression or fixed string
979/// @param text character vector to search
980/// @param fixed logical: if TRUE, pattern is a literal string
981/// @param ignore.case logical: if TRUE, matching is case-insensitive
982/// @return integer vector of match positions with "match.length" attribute
983#[builtin(min_args = 2)]
984fn builtin_regexpr(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
985    let pattern = args
986        .first()
987        .and_then(|v| v.as_vector()?.as_character_scalar())
988        .unwrap_or_default();
989    let (fixed, ignore_case) = get_regex_opts(named);
990    let re = build_regex(&pattern, fixed, ignore_case)?;
991    match args.get(1) {
992        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
993            let Vector::Character(vals) = &rv.inner else {
994                unreachable!()
995            };
996            let mut positions = Vec::new();
997            let mut lengths = Vec::new();
998            for s in vals.iter() {
999                match s.as_ref().and_then(|s| re.find(s)) {
1000                    Some(m) => {
1001                        positions.push(Some(i64::try_from(m.start())? + 1)); // R is 1-indexed
1002                        lengths.push(Some(i64::try_from(m.len())?));
1003                    }
1004                    None => {
1005                        positions.push(Some(-1));
1006                        lengths.push(Some(-1));
1007                    }
1008                }
1009            }
1010            let mut rv = RVector::from(Vector::Integer(positions.into()));
1011            rv.set_attr(
1012                "match.length".to_string(),
1013                RValue::vec(Vector::Integer(lengths.into())),
1014            );
1015            Ok(RValue::Vector(rv))
1016        }
1017        _ => Err(RError::new(
1018            RErrorKind::Argument,
1019            "argument is not character".to_string(),
1020        )),
1021    }
1022}
1023
1024/// Find all matches of a pattern in each element of a character vector.
1025///
1026/// @param pattern character scalar: regular expression or fixed string
1027/// @param text character vector to search
1028/// @param fixed logical: if TRUE, pattern is a literal string
1029/// @param ignore.case logical: if TRUE, matching is case-insensitive
1030/// @return list of integer vectors with match positions and "match.length" attributes
1031#[builtin(min_args = 2)]
1032fn builtin_gregexpr(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1033    let pattern = args
1034        .first()
1035        .and_then(|v| v.as_vector()?.as_character_scalar())
1036        .unwrap_or_default();
1037    let (fixed, ignore_case) = get_regex_opts(named);
1038    let re = build_regex(&pattern, fixed, ignore_case)?;
1039    match args.get(1) {
1040        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
1041            let Vector::Character(vals) = &rv.inner else {
1042                unreachable!()
1043            };
1044            let mut list_items = Vec::new();
1045            for s in vals.iter() {
1046                let (positions, lengths): (Vec<Option<i64>>, Vec<Option<i64>>) = match s.as_ref() {
1047                    Some(s) => {
1048                        let matches: Vec<_> = re.find_iter(s).collect();
1049                        if matches.is_empty() {
1050                            (vec![Some(-1)], vec![Some(-1)])
1051                        } else {
1052                            let (positions, lengths): (Vec<_>, Vec<_>) = matches
1053                                .iter()
1054                                .map(|m| -> Result<_, RError> {
1055                                    Ok((
1056                                        Some(i64::try_from(m.start())? + 1),
1057                                        Some(i64::try_from(m.len())?),
1058                                    ))
1059                                })
1060                                .collect::<Result<Vec<_>, _>>()?
1061                                .into_iter()
1062                                .unzip();
1063                            (positions, lengths)
1064                        }
1065                    }
1066                    None => (vec![Some(-1)], vec![Some(-1)]),
1067                };
1068                let mut match_rv = RVector::from(Vector::Integer(positions.into()));
1069                match_rv.set_attr(
1070                    "match.length".to_string(),
1071                    RValue::vec(Vector::Integer(lengths.into())),
1072                );
1073                list_items.push((None, RValue::Vector(match_rv)));
1074            }
1075            Ok(RValue::List(RList::new(list_items)))
1076        }
1077        _ => Err(RError::new(
1078            RErrorKind::Argument,
1079            "argument is not character".to_string(),
1080        )),
1081    }
1082}
1083
1084/// Extract matched substrings from regexpr/gregexpr results.
1085///
1086/// @param x character vector that was searched
1087/// @param m match data from regexpr() or gregexpr()
1088/// @return character vector (for regexpr) or list (for gregexpr) of matched substrings
1089#[builtin(min_args = 2)]
1090fn builtin_regmatches(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
1091    let x = match args.first() {
1092        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
1093            let Vector::Character(vals) = &rv.inner else {
1094                unreachable!()
1095            };
1096            vals.clone()
1097        }
1098        _ => {
1099            return Err(RError::new(
1100                RErrorKind::Argument,
1101                "argument is not character".to_string(),
1102            ))
1103        }
1104    };
1105
1106    // Second arg is regexpr/gregexpr output
1107    match args.get(1) {
1108        // regexpr result: single integer vector with match.length attr
1109        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Integer(_)) => {
1110            let Vector::Integer(positions) = &rv.inner else {
1111                unreachable!()
1112            };
1113            let lengths = match rv.get_attr("match.length") {
1114                Some(RValue::Vector(lv)) => match &lv.inner {
1115                    Vector::Integer(l) => l.clone(),
1116                    _ => {
1117                        return Err(RError::new(
1118                            RErrorKind::Argument,
1119                            "invalid match data".to_string(),
1120                        ))
1121                    }
1122                },
1123                _ => {
1124                    return Err(RError::new(
1125                        RErrorKind::Argument,
1126                        "invalid match data".to_string(),
1127                    ))
1128                }
1129            };
1130            let mut result = Vec::new();
1131            for (i, pos) in positions.iter_opt().enumerate() {
1132                let p = pos.unwrap_or(-1);
1133                let l = lengths.get_opt(i).unwrap_or(-1);
1134                if p > 0 && l > 0 {
1135                    if let Some(Some(s)) = x.get(i) {
1136                        let start = usize::try_from(p - 1)?;
1137                        let end = start + usize::try_from(l)?;
1138                        if end <= s.len() {
1139                            result.push(Some(s[start..end].to_string()));
1140                        } else {
1141                            result.push(None);
1142                        }
1143                    } else {
1144                        result.push(None);
1145                    }
1146                }
1147                // If no match (p == -1), skip (R returns character(0) effectively)
1148            }
1149            Ok(RValue::vec(Vector::Character(result.into())))
1150        }
1151        // gregexpr result: list of integer vectors
1152        Some(RValue::List(list)) => {
1153            let mut list_items = Vec::new();
1154            for (i, (_, match_val)) in list.values.iter().enumerate() {
1155                let RValue::Vector(rv) = match_val else {
1156                    list_items.push((
1157                        None,
1158                        RValue::vec(Vector::Character(Vec::<Option<String>>::new().into())),
1159                    ));
1160                    continue;
1161                };
1162                let Vector::Integer(positions) = &rv.inner else {
1163                    list_items.push((
1164                        None,
1165                        RValue::vec(Vector::Character(Vec::<Option<String>>::new().into())),
1166                    ));
1167                    continue;
1168                };
1169                let lengths: Integer = match rv.get_attr("match.length") {
1170                    Some(RValue::Vector(lv)) => match &lv.inner {
1171                        Vector::Integer(l) => l.clone(),
1172                        _ => Integer::from_values(vec![]),
1173                    },
1174                    _ => Integer::from_values(vec![]),
1175                };
1176                let s = x.get(i).and_then(|s| s.as_ref());
1177                let mut matches = Vec::new();
1178                for (j, pos) in positions.iter_opt().enumerate() {
1179                    let p = pos.unwrap_or(-1);
1180                    let l = lengths.get_opt(j).unwrap_or(-1);
1181                    if p > 0 && l > 0 {
1182                        if let Some(s) = s {
1183                            let start = usize::try_from(p - 1)?;
1184                            let end = start + usize::try_from(l)?;
1185                            if end <= s.len() {
1186                                matches.push(Some(s[start..end].to_string()));
1187                            }
1188                        }
1189                    }
1190                }
1191                list_items.push((None, RValue::vec(Vector::Character(matches.into()))));
1192            }
1193            Ok(RValue::List(RList::new(list_items)))
1194        }
1195        _ => Err(RError::new(
1196            RErrorKind::Argument,
1197            "invalid match data".to_string(),
1198        )),
1199    }
1200}
1201
1202/// Replace matched substrings using regexpr/gregexpr match data.
1203///
1204/// Called as `regmatches(x, m) <- value`. The replacement function receives
1205/// args `(x, m, value)`:
1206/// - For regexpr data: value is a character vector of replacements
1207/// - For gregexpr data: value is a list of character vectors
1208///
1209/// @param x character vector that was searched
1210/// @param m match data from regexpr() or gregexpr()
1211/// @param value replacement strings
1212/// @return character vector with matched substrings replaced
1213#[builtin(name = "regmatches<-", min_args = 3)]
1214fn builtin_regmatches_assign(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
1215    let x = match args.first() {
1216        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
1217            let Vector::Character(vals) = &rv.inner else {
1218                unreachable!()
1219            };
1220            vals.clone()
1221        }
1222        _ => {
1223            return Err(RError::new(
1224                RErrorKind::Argument,
1225                "argument 'x' is not character".to_string(),
1226            ))
1227        }
1228    };
1229
1230    let match_data = args.get(1).ok_or_else(|| {
1231        RError::new(
1232            RErrorKind::Argument,
1233            "missing match data argument".to_string(),
1234        )
1235    })?;
1236
1237    let value = args.get(2).ok_or_else(|| {
1238        RError::new(
1239            RErrorKind::Argument,
1240            "missing replacement value".to_string(),
1241        )
1242    })?;
1243
1244    match match_data {
1245        // regexpr result: single integer vector with match.length attr
1246        RValue::Vector(rv) if matches!(rv.inner, Vector::Integer(_)) => {
1247            let Vector::Integer(positions) = &rv.inner else {
1248                unreachable!()
1249            };
1250            let lengths: Integer = match rv.get_attr("match.length") {
1251                Some(RValue::Vector(lv)) => match &lv.inner {
1252                    Vector::Integer(l) => l.clone(),
1253                    _ => {
1254                        return Err(RError::new(
1255                            RErrorKind::Argument,
1256                            "invalid match data: match.length attribute is not integer".to_string(),
1257                        ))
1258                    }
1259                },
1260                _ => {
1261                    return Err(RError::new(
1262                        RErrorKind::Argument,
1263                        "invalid match data: missing match.length attribute".to_string(),
1264                    ))
1265                }
1266            };
1267            let replacements = match value {
1268                RValue::Vector(rv) => rv.to_characters(),
1269                _ => {
1270                    return Err(RError::new(
1271                        RErrorKind::Argument,
1272                        "replacement value must be a character vector".to_string(),
1273                    ))
1274                }
1275            };
1276
1277            let mut result: Vec<Option<String>> = x.to_vec();
1278            for (i, pos) in positions.iter_opt().enumerate() {
1279                let p = pos.unwrap_or(-1);
1280                let l = lengths.get_opt(i).unwrap_or(-1);
1281                if p > 0 && l >= 0 {
1282                    if let Some(Some(s)) = result.get(i) {
1283                        let start = usize::try_from(p - 1)?;
1284                        let end = start + usize::try_from(l)?;
1285                        if end <= s.len() {
1286                            let repl = replacements
1287                                .get(i)
1288                                .and_then(|r| r.as_ref())
1289                                .map(|r| r.as_str())
1290                                .unwrap_or("");
1291                            let new_s = format!("{}{}{}", &s[..start], repl, &s[end..]);
1292                            result[i] = Some(new_s);
1293                        }
1294                    }
1295                }
1296            }
1297            Ok(RValue::vec(Vector::Character(result.into())))
1298        }
1299        // gregexpr result: list of integer vectors
1300        RValue::List(match_list) => {
1301            let repl_list = match value {
1302                RValue::List(l) => l,
1303                _ => {
1304                    return Err(RError::new(
1305                        RErrorKind::Argument,
1306                        "replacement value must be a list for gregexpr match data".to_string(),
1307                    ))
1308                }
1309            };
1310
1311            let mut result: Vec<Option<String>> = x.to_vec();
1312            for (i, (_, match_val)) in match_list.values.iter().enumerate() {
1313                let RValue::Vector(rv) = match_val else {
1314                    continue;
1315                };
1316                let Vector::Integer(positions) = &rv.inner else {
1317                    continue;
1318                };
1319                let lengths: Integer = match rv.get_attr("match.length") {
1320                    Some(RValue::Vector(lv)) => match &lv.inner {
1321                        Vector::Integer(l) => l.clone(),
1322                        _ => Integer::from_values(vec![]),
1323                    },
1324                    _ => Integer::from_values(vec![]),
1325                };
1326
1327                let repls: Vec<Option<String>> = match repl_list.values.get(i) {
1328                    Some((_, RValue::Vector(rv))) => rv.to_characters().to_vec(),
1329                    _ => vec![],
1330                };
1331
1332                if let Some(Some(s)) = result.get(i) {
1333                    // Collect all match positions and replacements, process from right to left
1334                    // so earlier positions remain valid after replacement
1335                    let mut edits: Vec<(usize, usize, &str)> = Vec::new();
1336                    for (j, pos) in positions.iter_opt().enumerate() {
1337                        let p = pos.unwrap_or(-1);
1338                        let l = lengths.get_opt(j).unwrap_or(-1);
1339                        if p > 0 && l >= 0 {
1340                            let start = usize::try_from(p - 1)?;
1341                            let end = start + usize::try_from(l)?;
1342                            if end <= s.len() {
1343                                let repl = repls
1344                                    .get(j)
1345                                    .and_then(|r| r.as_ref())
1346                                    .map(|r| r.as_str())
1347                                    .unwrap_or("");
1348                                edits.push((start, end, repl));
1349                            }
1350                        }
1351                    }
1352                    // Apply edits right-to-left so byte offsets stay valid
1353                    edits.sort_by(|a, b| b.0.cmp(&a.0));
1354                    let mut new_s = s.clone();
1355                    for (start, end, repl) in edits {
1356                        new_s = format!("{}{}{}", &new_s[..start], repl, &new_s[end..]);
1357                    }
1358                    result[i] = Some(new_s);
1359                }
1360            }
1361            Ok(RValue::vec(Vector::Character(result.into())))
1362        }
1363        _ => Err(RError::new(
1364            RErrorKind::Argument,
1365            "invalid match data".to_string(),
1366        )),
1367    }
1368}
1369
/// Parsed format specifier from a `sprintf` format string.
///
/// Built by `parse_fmt_spec` from the characters that follow a `%`.
struct FmtSpec {
    /// Flag characters in the order they appeared; each is one of `-`, `+`, space, `0`, `#`.
    flags: String,
    /// Minimum field width, if the specifier contained one.
    width: Option<usize>,
    /// Precision (the digits after `.`), if the specifier contained a `.`.
    precision: Option<usize>,
    /// Conversion character that ends the specifier (e.g. `d`, `f`, `s`).
    specifier: char,
}
1377
1378impl FmtSpec {
1379    /// Apply width and flags to an already-formatted value string.
1380    ///
1381    /// Uses `UnicodeWidthStr::width()` so that CJK characters and emoji
1382    /// (which occupy two terminal columns) are measured correctly.
1383    fn pad(&self, formatted: &str) -> String {
1384        let width = match self.width {
1385            Some(w) => w,
1386            None => return formatted.to_string(),
1387        };
1388        let display_width = UnicodeWidthStr::width(formatted);
1389        if display_width >= width {
1390            return formatted.to_string();
1391        }
1392        let pad_char = if self.flags.contains('0') && !self.flags.contains('-') {
1393            '0'
1394        } else {
1395            ' '
1396        };
1397        let pad_len = width.saturating_sub(display_width);
1398        if self.flags.contains('-') {
1399            // left-align: content then spaces
1400            format!("{}{}", formatted, " ".repeat(pad_len))
1401        } else if pad_char == '0' {
1402            // zero-pad: preserve leading sign/minus
1403            if let Some(rest) = formatted.strip_prefix('-') {
1404                let pad_len = width.saturating_sub(UnicodeWidthStr::width(rest) + 1);
1405                format!("-{}{}", "0".repeat(pad_len), rest)
1406            } else {
1407                format!("{}{}", "0".repeat(pad_len), formatted)
1408            }
1409        } else {
1410            // right-align with spaces
1411            format!("{}{}", " ".repeat(pad_len), formatted)
1412        }
1413    }
1414
1415    /// Format an integer value according to this specifier.
1416    fn format_int(&self, v: i64) -> String {
1417        let raw = if self.flags.contains('+') && v >= 0 {
1418            format!("+{}", v)
1419        } else {
1420            v.to_string()
1421        };
1422        self.pad(&raw)
1423    }
1424
1425    /// Format a float value according to this specifier.
1426    fn format_float(&self, v: f64) -> String {
1427        let prec = self.precision.unwrap_or(6);
1428        let raw = match self.specifier {
1429            'f' => {
1430                let s = format!("{:.prec$}", v, prec = prec);
1431                if self.flags.contains('+') && v >= 0.0 && !v.is_nan() {
1432                    format!("+{}", s)
1433                } else {
1434                    s
1435                }
1436            }
1437            'e' | 'E' => {
1438                let s = format_scientific(v, prec, self.specifier == 'E');
1439                if self.flags.contains('+') && v >= 0.0 && !v.is_nan() {
1440                    format!("+{}", s)
1441                } else {
1442                    s
1443                }
1444            }
1445            'g' | 'G' => {
1446                let s = format_g(v, prec, self.specifier == 'G');
1447                if self.flags.contains('+') && v >= 0.0 && !v.is_nan() {
1448                    format!("+{}", s)
1449                } else {
1450                    s
1451                }
1452            }
1453            _ => format!("{}", v),
1454        };
1455        self.pad(&raw)
1456    }
1457
1458    /// Format a string value according to this specifier.
1459    fn format_str(&self, v: &str) -> String {
1460        let truncated = match self.precision {
1461            Some(prec) => &v[..v.len().min(prec)],
1462            None => v,
1463        };
1464        self.pad(truncated)
1465    }
1466}
1467
/// Format a float in scientific notation matching R's output (two-digit exponent minimum).
///
/// `prec` is the number of digits after the decimal point of the mantissa and `upper`
/// selects `E` instead of `e`. NaN and infinities are returned as `NaN`, `Inf`, `-Inf`.
///
/// Mantissa and exponent both come from Rust's `{:e}` formatting, so a mantissa that
/// rounds up to 10 is carried into the exponent (`9.9999` at two digits is `1.00e+01`,
/// not `10.00e+00`).
fn format_scientific(v: f64, prec: usize, upper: bool) -> String {
    if v.is_nan() {
        return "NaN".to_string();
    }
    if v.is_infinite() {
        return if v > 0.0 {
            "Inf".to_string()
        } else {
            "-Inf".to_string()
        };
    }
    let e_char = if upper { 'E' } else { 'e' };
    if v == 0.0 {
        // Covers both +0.0 and -0.0; the sign of zero is not printed.
        return format!("{:.prec$}{}{}", 0.0, e_char, "+00", prec = prec);
    }
    // Rust renders e.g. "1.23e3" / "-4.50e-7": correctly rounded, but with a bare exponent.
    let sci = format!("{:.prec$e}", v, prec = prec);
    let (mantissa, exp_text) = sci.split_once('e').unwrap_or((sci.as_str(), "0"));
    let exp: i32 = exp_text.parse().unwrap_or(0);
    // Re-render the exponent with an explicit sign and at least two digits.
    format!("{}{}{:+03}", mantissa, e_char, exp)
}
1489
/// Format using %g/%G: use shorter of %f and %e, removing trailing zeros.
///
/// `prec` is the number of significant digits (0 is treated as 1) and `upper` selects `E`
/// instead of `e` in exponent form. NaN and infinities are returned as `NaN`, `Inf`,
/// `-Inf`; zero is returned as `0`.
///
/// The choice between the two forms follows C's `%g`: the value is first rounded to `prec`
/// significant digits, and exponent form is used when the exponent of the *rounded* value
/// is below -4 or at least `prec`. The exponent-form text is produced here from that same
/// rounding, so the decision and the printed exponent can never disagree.
fn format_g(v: f64, prec: usize, upper: bool) -> String {
    if v.is_nan() {
        return "NaN".to_string();
    }
    if v.is_infinite() {
        return if v > 0.0 {
            "Inf".to_string()
        } else {
            "-Inf".to_string()
        };
    }
    let prec = prec.max(1);
    if v == 0.0 {
        return "0".to_string();
    }

    // Round to `prec` significant digits; Rust yields e.g. "9.99999e5" or "1.00000e6".
    let sci = format!("{:.digits$e}", v, digits = prec - 1);
    let (mantissa, exp_text) = sci.split_once('e').unwrap_or((sci.as_str(), "0"));
    let exp: i32 = exp_text.parse().unwrap_or(0);

    // Use %e if exponent < -4 or >= precision
    if exp < -4 || exp >= i32::try_from(prec).unwrap_or(i32::MAX) {
        // Remove trailing zeros in the mantissa (only meaningful after a decimal point).
        let mantissa = if mantissa.contains('.') {
            mantissa.trim_end_matches('0').trim_end_matches('.')
        } else {
            mantissa
        };
        let e_char = if upper { 'E' } else { 'e' };
        format!("{}{}{:+03}", mantissa, e_char, exp)
    } else {
        // Use %f with enough decimal places to show `prec` significant digits
        let decimal_places = if exp >= 0 {
            prec.saturating_sub(usize::try_from(exp + 1).unwrap_or(0))
        } else {
            prec + usize::try_from(-exp - 1).unwrap_or(0)
        };
        let s = format!("{:.prec$}", v, prec = decimal_places);
        // Remove trailing zeros after decimal point
        if s.contains('.') {
            s.trim_end_matches('0').trim_end_matches('.').to_string()
        } else {
            s
        }
    }
}
1536
1537/// Parse a format specifier starting after '%'. Returns (FmtSpec, chars consumed).
1538fn parse_fmt_spec(chars: &[char]) -> Option<(FmtSpec, usize)> {
1539    let mut i = 0;
1540
1541    // Parse flags
1542    let mut flags = String::new();
1543    while i < chars.len() && "-+ 0#".contains(chars[i]) {
1544        flags.push(chars[i]);
1545        i += 1;
1546    }
1547
1548    // Parse width
1549    let mut width = None;
1550    let width_start = i;
1551    while i < chars.len() && chars[i].is_ascii_digit() {
1552        i += 1;
1553    }
1554    if i > width_start {
1555        width = chars[width_start..i]
1556            .iter()
1557            .collect::<String>()
1558            .parse()
1559            .ok();
1560    }
1561
1562    // Parse precision
1563    let mut precision = None;
1564    if i < chars.len() && chars[i] == '.' {
1565        i += 1;
1566        let prec_start = i;
1567        while i < chars.len() && chars[i].is_ascii_digit() {
1568            i += 1;
1569        }
1570        precision = Some(
1571            chars[prec_start..i]
1572                .iter()
1573                .collect::<String>()
1574                .parse()
1575                .unwrap_or(0),
1576        );
1577    }
1578
1579    // Parse conversion specifier
1580    if i < chars.len() {
1581        let specifier = chars[i];
1582        i += 1;
1583        Some((
1584            FmtSpec {
1585                flags,
1586                width,
1587                precision,
1588                specifier,
1589            },
1590            i,
1591        ))
1592    } else {
1593        None
1594    }
1595}
1596
1597/// Collect the format specifiers from a format string, returning a list of
1598/// `(FmtSpec, arg_index)` pairs (0-based among the data args, i.e. excluding fmt).
1599fn collect_fmt_specs(fmt: &str) -> Vec<(FmtSpec, usize)> {
1600    let chars: Vec<char> = fmt.chars().collect();
1601    let mut specs = Vec::new();
1602    let mut i = 0;
1603    let mut arg_idx: usize = 0;
1604    while i < chars.len() {
1605        if chars[i] == '%' && i + 1 < chars.len() {
1606            i += 1;
1607            if chars[i] == '%' {
1608                i += 1;
1609                continue;
1610            }
1611            if let Some((spec, consumed)) = parse_fmt_spec(&chars[i..]) {
1612                i += consumed;
1613                match spec.specifier {
1614                    'd' | 'i' | 'f' | 'e' | 'E' | 'g' | 'G' | 's' => {
1615                        specs.push((spec, arg_idx));
1616                        arg_idx += 1;
1617                    }
1618                    _ => {}
1619                }
1620            }
1621        } else {
1622            i += 1;
1623        }
1624    }
1625    specs
1626}
1627
1628/// Format one string from the format template, using element `elem_idx` from
1629/// each data-arg vector (with recycling).
1630fn sprintf_one(fmt: &str, data_args: &[&Vector], elem_idx: usize) -> Result<String, RError> {
1631    let chars: Vec<char> = fmt.chars().collect();
1632    let mut output = String::new();
1633    let mut i = 0;
1634    let mut arg_idx: usize = 0;
1635
1636    while i < chars.len() {
1637        if chars[i] == '%' && i + 1 < chars.len() {
1638            i += 1;
1639            if chars[i] == '%' {
1640                output.push('%');
1641                i += 1;
1642                continue;
1643            }
1644            if let Some((spec, consumed)) = parse_fmt_spec(&chars[i..]) {
1645                i += consumed;
1646                match spec.specifier {
1647                    'd' | 'i' => {
1648                        let vec = data_args.get(arg_idx).ok_or_else(|| {
1649                            RError::new(
1650                                RErrorKind::Argument,
1651                                format!(
1652                                    "too few arguments for sprintf format: \
1653                                     need argument {} but only {} supplied",
1654                                    arg_idx + 1,
1655                                    data_args.len()
1656                                ),
1657                            )
1658                        })?;
1659                        let ints = vec.to_integers();
1660                        let v = if ints.is_empty() {
1661                            0
1662                        } else {
1663                            ints[elem_idx % ints.len()].unwrap_or(0)
1664                        };
1665                        output.push_str(&spec.format_int(v));
1666                        arg_idx += 1;
1667                    }
1668                    'f' | 'e' | 'E' | 'g' | 'G' => {
1669                        let vec = data_args.get(arg_idx).ok_or_else(|| {
1670                            RError::new(
1671                                RErrorKind::Argument,
1672                                format!(
1673                                    "too few arguments for sprintf format: \
1674                                     need argument {} but only {} supplied",
1675                                    arg_idx + 1,
1676                                    data_args.len()
1677                                ),
1678                            )
1679                        })?;
1680                        let doubles = vec.to_doubles();
1681                        let v = if doubles.is_empty() {
1682                            0.0
1683                        } else {
1684                            doubles[elem_idx % doubles.len()].unwrap_or(0.0)
1685                        };
1686                        output.push_str(&spec.format_float(v));
1687                        arg_idx += 1;
1688                    }
1689                    's' => {
1690                        let vec = data_args.get(arg_idx).ok_or_else(|| {
1691                            RError::new(
1692                                RErrorKind::Argument,
1693                                format!(
1694                                    "too few arguments for sprintf format: \
1695                                     need argument {} but only {} supplied",
1696                                    arg_idx + 1,
1697                                    data_args.len()
1698                                ),
1699                            )
1700                        })?;
1701                        let chars_vec = vec.to_characters();
1702                        let v = if chars_vec.is_empty() {
1703                            String::new()
1704                        } else {
1705                            chars_vec[elem_idx % chars_vec.len()]
1706                                .clone()
1707                                .unwrap_or_default()
1708                        };
1709                        output.push_str(&spec.format_str(&v));
1710                        arg_idx += 1;
1711                    }
1712                    _ => {
1713                        output.push('%');
1714                        output.push(spec.specifier);
1715                    }
1716                }
1717            }
1718        } else {
1719            output.push(chars[i]);
1720            i += 1;
1721        }
1722    }
1723    Ok(output)
1724}
1725
1726/// Format strings using C-style format specifiers, vectorized over arguments.
1727///
1728/// In R, `sprintf(fmt, ...)` is vectorized: if any argument is a vector of
1729/// length > 1, the result is a character vector of the same length (the
1730/// longest argument), with shorter arguments recycled.
1731///
1732/// @param fmt character scalar: format string with %d, %f, %s, etc.
1733/// @param ... values to substitute into the format string
1734/// @return character vector containing the formatted results
1735#[builtin(min_args = 1)]
1736fn builtin_sprintf(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
1737    let fmt_vec = args.first().and_then(|v| v.as_vector()).ok_or_else(|| {
1738        RError::new(
1739            RErrorKind::Argument,
1740            "sprintf requires a character format string".to_string(),
1741        )
1742    })?;
1743
1744    let fmt_chars = fmt_vec.to_characters();
1745
1746    // If format string is empty, return character(0)
1747    if fmt_chars.is_empty() {
1748        return Ok(RValue::vec(Vector::Character(
1749            Vec::<Option<String>>::new().into(),
1750        )));
1751    }
1752
1753    let fmt = fmt_chars[0].clone().unwrap_or_default();
1754
1755    // Collect data-arg vectors (args after the format string)
1756    let data_vecs: Vec<&Vector> = args[1..].iter().filter_map(|a| a.as_vector()).collect();
1757
1758    // If any data arg has length 0, return character(0) (R behavior)
1759    if data_vecs.iter().any(|v| v.is_empty()) {
1760        return Ok(RValue::vec(Vector::Character(
1761            Vec::<Option<String>>::new().into(),
1762        )));
1763    }
1764
1765    // Determine the output length: max of all data-arg lengths (minimum 1)
1766    let max_len = data_vecs.iter().map(|v| v.len()).max().unwrap_or(1);
1767
1768    // If there are no format specifiers that consume args, produce max_len copies
1769    // (or just 1 if no data args).  But if data_vecs is empty, just format once.
1770    let specs = collect_fmt_specs(&fmt);
1771    let output_len = if specs.is_empty() || data_vecs.is_empty() {
1772        if data_vecs.is_empty() {
1773            1
1774        } else {
1775            max_len
1776        }
1777    } else {
1778        max_len
1779    };
1780
1781    let mut results: Vec<Option<String>> = Vec::with_capacity(output_len);
1782    for elem_idx in 0..output_len {
1783        results.push(Some(sprintf_one(&fmt, &data_vecs, elem_idx)?));
1784    }
1785
1786    Ok(RValue::vec(Vector::Character(results.into())))
1787}
1788
1789// format() is in interp.rs (S3-dispatching interpreter builtin)
1790
1791/// Split strings by a pattern or fixed delimiter.
1792///
1793/// Vectorized over x: returns a list with one element per input string.
1794///
1795/// @param x character vector to split
1796/// @param split character scalar: pattern or fixed string to split on
1797/// @param fixed logical: if TRUE, split is a literal string
1798/// @param ignore.case logical: if TRUE, matching is case-insensitive
1799/// @return list of character vectors, one per input string
1800#[builtin(min_args = 2)]
1801fn builtin_strsplit(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
1802    let x_vec = args
1803        .first()
1804        .and_then(|v| v.as_vector())
1805        .map(|v| v.to_characters())
1806        .unwrap_or_default();
1807    let split = args
1808        .get(1)
1809        .and_then(|v| v.as_vector()?.as_character_scalar())
1810        .unwrap_or_default();
1811    let (fixed, ignore_case) = get_regex_opts(named);
1812
1813    // Pre-compile regex once if needed
1814    let re = if !split.is_empty() && (!fixed || ignore_case) {
1815        Some(build_regex(&split, fixed, ignore_case)?)
1816    } else {
1817        None
1818    };
1819
1820    let parts: Vec<(Option<String>, RValue)> = x_vec
1821        .into_iter()
1822        .map(|s_opt| {
1823            let elem = match s_opt {
1824                None => RValue::vec(Vector::Character(vec![None].into())),
1825                Some(s) => {
1826                    if split.is_empty() {
1827                        // Empty split: split into individual characters
1828                        let chars: Vec<Option<String>> =
1829                            s.chars().map(|c| Some(c.to_string())).collect();
1830                        RValue::vec(Vector::Character(chars.into()))
1831                    } else if fixed && !ignore_case {
1832                        // Fixed literal split, case-sensitive — memchr (SIMD-accelerated)
1833                        let pieces = fixed_split(&s, &split);
1834                        RValue::vec(Vector::Character(pieces.into()))
1835                    } else {
1836                        // Regex split
1837                        let pieces: Vec<Option<String>> = re
1838                            .as_ref()
1839                            .expect("regex compiled successfully above")
1840                            .split(&s)
1841                            .map(|p| Some(p.to_string()))
1842                            .collect();
1843                        RValue::vec(Vector::Character(pieces.into()))
1844                    }
1845                }
1846            };
1847            (None, elem)
1848        })
1849        .collect();
1850    Ok(RValue::List(RList::new(parts)))
1851}
1852
1853/// Test whether strings start with a given prefix.
1854///
1855/// Vectorized over both `x` and `prefix` with recycling.
1856///
1857/// @param x character vector to test
1858/// @param prefix character vector: the prefix(es) to look for
1859/// @return logical vector indicating which elements start with the prefix
1860#[builtin(name = "startsWith", min_args = 2)]
1861fn builtin_starts_with(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
1862    let x_vec = match args.first() {
1863        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
1864            let Vector::Character(vals) = &rv.inner else {
1865                unreachable!()
1866            };
1867            vals.clone()
1868        }
1869        _ => {
1870            return Err(RError::new(
1871                RErrorKind::Argument,
1872                "argument 'x' is not character".to_string(),
1873            ))
1874        }
1875    };
1876    let prefix_vec = args
1877        .get(1)
1878        .and_then(|v| v.as_vector())
1879        .map(|v| v.to_characters())
1880        .unwrap_or_default();
1881
1882    if x_vec.is_empty() || prefix_vec.is_empty() {
1883        return Ok(RValue::vec(Vector::Logical(
1884            Vec::<Option<bool>>::new().into(),
1885        )));
1886    }
1887
1888    let out_len = x_vec.len().max(prefix_vec.len());
1889    let result: Vec<Option<bool>> = (0..out_len)
1890        .map(|i| {
1891            let x_opt = &x_vec[i % x_vec.len()];
1892            let p_opt = &prefix_vec[i % prefix_vec.len()];
1893            match (x_opt, p_opt) {
1894                (Some(x), Some(p)) => Some(x.starts_with(p.as_str())),
1895                _ => None,
1896            }
1897        })
1898        .collect();
1899    Ok(RValue::vec(Vector::Logical(result.into())))
1900}
1901
1902/// Test whether strings end with a given suffix.
1903///
1904/// Vectorized over both `x` and `suffix` with recycling.
1905///
1906/// @param x character vector to test
1907/// @param suffix character vector: the suffix(es) to look for
1908/// @return logical vector indicating which elements end with the suffix
1909#[builtin(name = "endsWith", min_args = 2)]
1910fn builtin_ends_with(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
1911    let x_vec = match args.first() {
1912        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
1913            let Vector::Character(vals) = &rv.inner else {
1914                unreachable!()
1915            };
1916            vals.clone()
1917        }
1918        _ => {
1919            return Err(RError::new(
1920                RErrorKind::Argument,
1921                "argument 'x' is not character".to_string(),
1922            ))
1923        }
1924    };
1925    let suffix_vec = args
1926        .get(1)
1927        .and_then(|v| v.as_vector())
1928        .map(|v| v.to_characters())
1929        .unwrap_or_default();
1930
1931    if x_vec.is_empty() || suffix_vec.is_empty() {
1932        return Ok(RValue::vec(Vector::Logical(
1933            Vec::<Option<bool>>::new().into(),
1934        )));
1935    }
1936
1937    let out_len = x_vec.len().max(suffix_vec.len());
1938    let result: Vec<Option<bool>> = (0..out_len)
1939        .map(|i| {
1940            let x_opt = &x_vec[i % x_vec.len()];
1941            let s_opt = &suffix_vec[i % suffix_vec.len()];
1942            match (x_opt, s_opt) {
1943                (Some(x), Some(s)) => Some(x.ends_with(s.as_str())),
1944                _ => None,
1945            }
1946        })
1947        .collect();
1948    Ok(RValue::vec(Vector::Logical(result.into())))
1949}
1950
1951/// Translate characters in strings (character-by-character substitution).
1952///
1953/// Vectorized over x.
1954///
1955/// @param old character scalar: characters to replace
1956/// @param new character scalar: replacement characters (positionally matched)
1957/// @param x character vector: strings to translate
1958/// @return character vector with characters substituted
1959#[builtin(min_args = 3)]
1960fn builtin_chartr(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
1961    let old = args
1962        .first()
1963        .and_then(|v| v.as_vector()?.as_character_scalar())
1964        .unwrap_or_default();
1965    let new = args
1966        .get(1)
1967        .and_then(|v| v.as_vector()?.as_character_scalar())
1968        .unwrap_or_default();
1969    let x_vec = args
1970        .get(2)
1971        .and_then(|v| v.as_vector())
1972        .map(|v| v.to_characters())
1973        .unwrap_or_default();
1974    let old_chars: Vec<char> = old.chars().collect();
1975    let new_chars: Vec<char> = new.chars().collect();
1976    let result: Vec<Option<String>> = x_vec
1977        .into_iter()
1978        .map(|s_opt| {
1979            s_opt.map(|s| {
1980                s.chars()
1981                    .map(|c| {
1982                        if let Some(pos) = old_chars.iter().position(|&oc| oc == c) {
1983                            new_chars.get(pos).copied().unwrap_or(c)
1984                        } else {
1985                            c
1986                        }
1987                    })
1988                    .collect()
1989            })
1990        })
1991        .collect();
1992    Ok(RValue::vec(Vector::Character(result.into())))
1993}
1994
1995/// Make syntactically valid R names from character strings.
1996///
1997/// @param names character vector of names to sanitize
1998/// @return character vector of syntactically valid names
1999#[builtin(min_args = 1)]
2000fn builtin_make_names(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2001    match args.first() {
2002        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
2003            let Vector::Character(vals) = &rv.inner else {
2004                unreachable!()
2005            };
2006            let result: Vec<Option<String>> = vals
2007                .iter()
2008                .map(|s| {
2009                    s.as_ref().map(|s| {
2010                        let mut name = String::new();
2011                        for (i, c) in s.chars().enumerate() {
2012                            if i == 0 && c.is_ascii_digit() {
2013                                name.push('X');
2014                            }
2015                            if c.is_alphanumeric() || c == '.' || c == '_' {
2016                                name.push(c);
2017                            } else {
2018                                name.push('.');
2019                            }
2020                        }
2021                        if name.is_empty() {
2022                            name = "X".to_string();
2023                        }
2024                        name
2025                    })
2026                })
2027                .collect();
2028            Ok(RValue::vec(Vector::Character(result.into())))
2029        }
2030        _ => Ok(args.first().cloned().unwrap_or(RValue::Null)),
2031    }
2032}
2033
2034/// Make character strings unique by appending sequence numbers to duplicates.
2035///
2036/// @param names character vector of names to deduplicate
2037/// @return character vector with duplicates disambiguated (e.g. "x", "x.1", "x.2")
2038#[builtin(min_args = 1)]
2039fn builtin_make_unique(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2040    match args.first() {
2041        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
2042            let Vector::Character(vals) = &rv.inner else {
2043                unreachable!()
2044            };
2045            let mut result = Vec::new();
2046            let mut counts: HashMap<String, usize> = HashMap::new();
2047            for v in vals.iter() {
2048                if let Some(s) = v {
2049                    let count = counts.entry(s.clone()).or_insert(0);
2050                    if *count > 0 {
2051                        result.push(Some(format!("{}.{}", s, count)));
2052                    } else {
2053                        result.push(Some(s.clone()));
2054                    }
2055                    *count += 1;
2056                } else {
2057                    result.push(None);
2058                }
2059            }
2060            Ok(RValue::vec(Vector::Character(result.into())))
2061        }
2062        _ => Ok(args.first().cloned().unwrap_or(RValue::Null)),
2063    }
2064}
2065
2066/// Extract the file name from paths.
2067///
2068/// Vectorized over path.
2069///
2070/// @param path character vector of file paths
2071/// @return character vector containing the base file names
2072#[builtin(min_args = 1)]
2073fn builtin_basename(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2074    let path_vec = args
2075        .first()
2076        .and_then(|v| v.as_vector())
2077        .map(|v| v.to_characters())
2078        .unwrap_or_default();
2079    let result: Vec<Option<String>> = path_vec
2080        .into_iter()
2081        .map(|p_opt| {
2082            p_opt.map(|p| {
2083                std::path::Path::new(&p)
2084                    .file_name()
2085                    .map(|n| n.to_string_lossy().to_string())
2086                    .unwrap_or(p)
2087            })
2088        })
2089        .collect();
2090    Ok(RValue::vec(Vector::Character(result.into())))
2091}
2092
2093/// Extract the directory part from paths.
2094///
2095/// Vectorized over path.
2096///
2097/// @param path character vector of file paths
2098/// @return character vector containing the directory components
2099#[builtin(min_args = 1)]
2100fn builtin_dirname(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2101    let path_vec = args
2102        .first()
2103        .and_then(|v| v.as_vector())
2104        .map(|v| v.to_characters())
2105        .unwrap_or_default();
2106    let result: Vec<Option<String>> = path_vec
2107        .into_iter()
2108        .map(|p_opt| {
2109            p_opt.map(|p| {
2110                std::path::Path::new(&p)
2111                    .parent()
2112                    .map(|par| par.to_string_lossy().to_string())
2113                    .unwrap_or_else(|| ".".to_string())
2114            })
2115        })
2116        .collect();
2117    Ok(RValue::vec(Vector::Character(result.into())))
2118}
2119
2120/// Convert an R expression or value to its string representation.
2121///
2122/// @param expr any R value or language object
2123/// @return character scalar containing the deparsed representation
2124#[builtin(min_args = 1)]
2125fn builtin_deparse(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2126    let s = match args.first() {
2127        Some(RValue::Language(expr)) => deparse_expr(expr),
2128        Some(v) => format!("{}", v),
2129        None => "NULL".to_string(),
2130    };
2131    Ok(RValue::vec(Vector::Character(vec![Some(s)].into())))
2132}
2133
2134/// Single-string deparse (R 4.0+). Collapses multi-line deparse output.
2135///
2136/// @param expr any R value or language object
2137/// @param collapse character: separator for joining lines (default " ")
2138/// @return character scalar
2139#[builtin(name = "deparse1", min_args = 1)]
2140fn builtin_deparse1(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2141    let collapse = named
2142        .iter()
2143        .find(|(n, _)| n == "collapse")
2144        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
2145        .unwrap_or_else(|| " ".to_string());
2146
2147    let s = match args.first() {
2148        Some(RValue::Language(expr)) => deparse_expr(expr),
2149        Some(v) => format!("{v}"),
2150        None => "NULL".to_string(),
2151    };
2152    let collapsed = s.lines().collect::<Vec<_>>().join(&collapse);
2153    Ok(RValue::vec(Vector::Character(vec![Some(collapsed)].into())))
2154}
2155
2156/// Convert integer Unicode code points to a UTF-8 string.
2157///
2158/// @param x integer vector of Unicode code points
2159/// @return character scalar containing the corresponding UTF-8 string
2160#[builtin(name = "intToUtf8", min_args = 1)]
2161fn builtin_int_to_utf8(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2162    let ints = match args.first() {
2163        Some(RValue::Vector(rv)) => rv.to_integers(),
2164        _ => {
2165            return Err(RError::new(
2166                RErrorKind::Argument,
2167                "argument must be an integer vector".to_string(),
2168            ))
2169        }
2170    };
2171    let mut result = String::new();
2172    for val in &ints {
2173        match val {
2174            Some(code) if *code >= 0 => match char::from_u32(u32::try_from(*code)?) {
2175                Some(c) => result.push(c),
2176                None => {
2177                    return Err(RError::new(
2178                        RErrorKind::Argument,
2179                        format!("invalid Unicode code point: {}", code),
2180                    ))
2181                }
2182            },
2183            Some(code) => {
2184                return Err(RError::new(
2185                    RErrorKind::Argument,
2186                    format!("invalid Unicode code point: {}", code),
2187                ))
2188            }
2189            None => result.push('\u{FFFD}'), // replacement character for NA
2190        }
2191    }
2192    Ok(RValue::vec(Vector::Character(vec![Some(result)].into())))
2193}
2194
2195/// Convert a UTF-8 string to integer Unicode code points.
2196///
2197/// @param x character scalar to convert
2198/// @return integer vector of Unicode code points
2199#[builtin(name = "utf8ToInt", min_args = 1)]
2200fn builtin_utf8_to_int(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2201    let s = args
2202        .first()
2203        .and_then(|v| v.as_vector()?.as_character_scalar())
2204        .ok_or_else(|| {
2205            RError::new(
2206                RErrorKind::Argument,
2207                "argument must be a single string".to_string(),
2208            )
2209        })?;
2210    let result: Vec<Option<i64>> = s.chars().map(|c| Some(i64::from(u32::from(c)))).collect();
2211    Ok(RValue::vec(Vector::Integer(result.into())))
2212}
2213
2214/// Convert a character string to a raw (byte) vector.
2215///
2216/// @param x character scalar to convert
2217/// @return raw vector of the string's UTF-8 bytes
2218#[builtin(name = "charToRaw", min_args = 1)]
2219fn builtin_char_to_raw(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2220    let s = args
2221        .first()
2222        .and_then(|v| v.as_vector()?.as_character_scalar())
2223        .ok_or_else(|| RError::new(RErrorKind::Argument, "argument must be a single string"))?;
2224    let result: Vec<u8> = s.bytes().collect();
2225    Ok(RValue::vec(Vector::Raw(result)))
2226}
2227
2228/// Convert a raw (byte) vector to a character string.
2229///
2230/// Uses bstr for graceful handling of non-UTF-8 bytes: invalid sequences
2231/// are replaced with the Unicode replacement character (U+FFFD) rather than
2232/// producing an error, matching R's tolerant behavior with `rawToChar()`.
2233///
2234/// When `multiple = TRUE`, each byte becomes a separate character element.
2235///
2236/// @param x raw vector to convert
2237/// @param multiple logical: if TRUE, return one string per byte (default FALSE)
2238/// @return character scalar (or vector if multiple=TRUE) containing the string
2239#[builtin(name = "rawToChar", min_args = 1)]
2240fn builtin_raw_to_char(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2241    let bytes = match args.first() {
2242        Some(RValue::Vector(rv)) => rv.inner.to_raw(),
2243        _ => {
2244            return Err(RError::new(
2245                RErrorKind::Argument,
2246                "argument must be a raw or integer vector",
2247            ))
2248        }
2249    };
2250
2251    let multiple = named
2252        .iter()
2253        .find(|(k, _)| k == "multiple")
2254        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
2255        .or_else(|| args.get(1).and_then(|v| v.as_vector()?.as_logical_scalar()))
2256        .unwrap_or(false);
2257
2258    if multiple {
2259        // Each byte becomes a separate character element
2260        let result: Vec<Option<String>> = bytes
2261            .iter()
2262            .map(|&b| {
2263                // Strip embedded NULs: R silently drops \0 from rawToChar output
2264                if b == 0 {
2265                    Some(String::new())
2266                } else {
2267                    Some(std::slice::from_ref(&b).to_str_lossy().into_owned())
2268                }
2269            })
2270            .collect();
2271        Ok(RValue::vec(Vector::Character(result.into())))
2272    } else {
2273        // Strip NUL bytes (R strips embedded NULs in rawToChar)
2274        let filtered: Vec<u8> = bytes.into_iter().filter(|&b| b != 0).collect();
2275        let s = filtered.as_bstr().to_str_lossy().into_owned();
2276        Ok(RValue::vec(Vector::Character(vec![Some(s)].into())))
2277    }
2278}
2279
2280/// `raw(length)` — create a raw (byte) vector of zeros.
2281///
2282/// `raw()` with no arguments returns an empty raw vector (length 0).
2283#[builtin]
2284fn builtin_raw(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2285    let n = args
2286        .first()
2287        .and_then(|v| v.as_vector()?.as_integer_scalar())
2288        .unwrap_or(0);
2289    if n < 0 {
2290        return Err(RError::new(
2291            RErrorKind::Argument,
2292            format!("invalid 'length' argument: {}", n),
2293        ));
2294    }
2295    let len = usize::try_from(n)?;
2296    Ok(RValue::vec(Vector::Raw(vec![0u8; len])))
2297}
2298
2299/// `rawShift(x, n)` — bitwise shift of raw (byte) values.
2300/// Positive n shifts left, negative n shifts right.
2301#[builtin(name = "rawShift", min_args = 2)]
2302fn builtin_raw_shift(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2303    let bytes = match args.first() {
2304        Some(RValue::Vector(rv)) => rv.inner.to_raw(),
2305        _ => {
2306            return Err(RError::new(
2307                RErrorKind::Argument,
2308                "argument 'x' must be a raw vector",
2309            ))
2310        }
2311    };
2312    let shift = args
2313        .get(1)
2314        .and_then(|v| v.as_vector()?.as_integer_scalar())
2315        .ok_or_else(|| {
2316            RError::new(
2317                RErrorKind::Argument,
2318                "argument 'n' must be a single integer",
2319            )
2320        })?;
2321    if !(-8..=8).contains(&shift) {
2322        return Err(RError::new(
2323            RErrorKind::Argument,
2324            format!("shift amount must be between -8 and 8, got {}", shift),
2325        ));
2326    }
2327
2328    let result: Vec<u8> = bytes
2329        .iter()
2330        .map(|&byte| {
2331            if shift >= 0 {
2332                byte.wrapping_shl(u32::try_from(shift).unwrap_or(0))
2333            } else {
2334                byte.wrapping_shr(u32::try_from(-shift).unwrap_or(0))
2335            }
2336        })
2337        .collect();
2338    Ok(RValue::vec(Vector::Raw(result)))
2339}
2340
2341/// `as.raw(x)` — coerce to raw (byte) values (0-255), truncating to lowest byte.
2342#[builtin(name = "as.raw", min_args = 1)]
2343fn builtin_as_raw(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2344    match args.first() {
2345        Some(RValue::Vector(rv)) => Ok(RValue::vec(Vector::Raw(rv.inner.to_raw()))),
2346        _ => Err(RError::new(
2347            RErrorKind::Argument,
2348            "argument must be a vector",
2349        )),
2350    }
2351}
2352
2353/// `is.raw(x)` — test if argument is a raw vector.
2354#[builtin(name = "is.raw", min_args = 1)]
2355fn builtin_is_raw(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2356    let is_raw =
2357        matches!(args.first(), Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Raw(_)));
2358    Ok(RValue::vec(Vector::Logical(vec![Some(is_raw)].into())))
2359}
2360
2361/// Convert a glob (wildcard) pattern to a regular expression.
2362///
2363/// @param pattern character scalar: a glob pattern with * and ? wildcards
2364/// @return character scalar containing the equivalent anchored regex
2365#[builtin(name = "glob2rx", min_args = 1)]
2366fn builtin_glob2rx(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2367    let pattern = args
2368        .first()
2369        .and_then(|v| v.as_vector()?.as_character_scalar())
2370        .ok_or_else(|| {
2371            RError::new(
2372                RErrorKind::Argument,
2373                "argument must be a character string".to_string(),
2374            )
2375        })?;
2376    let mut result = String::from("^");
2377    for c in pattern.chars() {
2378        match c {
2379            '*' => result.push_str(".*"),
2380            '?' => result.push('.'),
2381            '.' | '(' | ')' | '+' | '|' | '{' | '}' | '[' | ']' | '^' | '$' | '\\' => {
2382                result.push('\\');
2383                result.push(c);
2384            }
2385            _ => result.push(c),
2386        }
2387    }
2388    result.push('$');
2389    Ok(RValue::vec(Vector::Character(vec![Some(result)].into())))
2390}
2391
2392/// Match a pattern with capture groups against a character vector.
2393///
2394/// @param pattern character scalar: regular expression with optional capture groups
2395/// @param text character vector to search
2396/// @param fixed logical: if TRUE, pattern is a literal string
2397/// @param ignore.case logical: if TRUE, matching is case-insensitive
2398/// @return list of integer vectors with match and capture-group positions
2399#[builtin(min_args = 2)]
2400fn builtin_regexec(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2401    let pattern = args
2402        .first()
2403        .and_then(|v| v.as_vector()?.as_character_scalar())
2404        .unwrap_or_default();
2405    let (fixed, ignore_case) = get_regex_opts(named);
2406    let re = build_regex(&pattern, fixed, ignore_case)?;
2407    match args.get(1) {
2408        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
2409            let Vector::Character(vals) = &rv.inner else {
2410                unreachable!()
2411            };
2412            let mut list_items = Vec::new();
2413            for s in vals.iter() {
2414                match s.as_ref().and_then(|s| re.captures(s)) {
2415                    Some(caps) => {
2416                        let mut positions = Vec::new();
2417                        let mut lengths = Vec::new();
2418                        for i in 0..caps.len() {
2419                            match caps.get(i) {
2420                                Some(m) => {
2421                                    positions.push(Some(i64::try_from(m.start())? + 1));
2422                                    lengths.push(Some(i64::try_from(m.len())?));
2423                                }
2424                                None => {
2425                                    positions.push(Some(-1));
2426                                    lengths.push(Some(-1));
2427                                }
2428                            }
2429                        }
2430                        let mut match_rv = RVector::from(Vector::Integer(positions.into()));
2431                        match_rv.set_attr(
2432                            "match.length".to_string(),
2433                            RValue::vec(Vector::Integer(lengths.into())),
2434                        );
2435                        list_items.push((None, RValue::Vector(match_rv)));
2436                    }
2437                    None => {
2438                        let mut match_rv = RVector::from(Vector::Integer(vec![Some(-1)].into()));
2439                        match_rv.set_attr(
2440                            "match.length".to_string(),
2441                            RValue::vec(Vector::Integer(vec![Some(-1)].into())),
2442                        );
2443                        list_items.push((None, RValue::Vector(match_rv)));
2444                    }
2445                }
2446            }
2447            Ok(RValue::List(RList::new(list_items)))
2448        }
2449        _ => Err(RError::new(
2450            RErrorKind::Argument,
2451            "argument is not character".to_string(),
2452        )),
2453    }
2454}
2455
2456/// Write a deparsed representation of an R object to stdout.
2457///
2458/// @param x any R value or language object
2459/// @return NULL (invisibly); output is printed to stdout
2460#[interpreter_builtin(min_args = 1)]
2461fn interp_dput(
2462    args: &[RValue],
2463    _named: &[(String, RValue)],
2464    context: &BuiltinContext,
2465) -> Result<RValue, RError> {
2466    let s = match args.first() {
2467        Some(RValue::Language(expr)) => deparse_expr(expr),
2468        Some(v) => format!("{}", v),
2469        None => "NULL".to_string(),
2470    };
2471    context.write(&format!("{}\n", s));
2472    Ok(RValue::Null)
2473}
2474
2475/// Convert strings to integers using a specified base (radix).
2476///
2477/// Vectorized over x.
2478///
2479/// @param x character vector: the strings to parse
2480/// @param base integer scalar: the radix (default 10)
2481/// @return integer vector, NA where parsing fails
2482#[builtin(min_args = 1)]
2483fn builtin_strtoi(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2484    let x_vec = args
2485        .first()
2486        .and_then(|v| v.as_vector())
2487        .map(|v| v.to_characters())
2488        .unwrap_or_default();
2489    let base = named
2490        .iter()
2491        .find(|(n, _)| n == "base")
2492        .and_then(|(_, v)| v.as_vector()?.as_integer_scalar())
2493        .or_else(|| args.get(1).and_then(|v| v.as_vector()?.as_integer_scalar()))
2494        .unwrap_or(10);
2495    let base = u32::try_from(base)?;
2496    let result: Vec<Option<i64>> = x_vec
2497        .into_iter()
2498        .map(|s_opt| match s_opt {
2499            None => None,
2500            Some(s) => i64::from_str_radix(s.trim(), base).ok(),
2501        })
2502        .collect();
2503    Ok(RValue::vec(Vector::Integer(result.into())))
2504}
2505
2506/// Test whether character strings are non-empty.
2507///
2508/// @param x character vector to test
2509/// @return logical vector: TRUE for non-empty strings, TRUE for NA
2510#[builtin(min_args = 1)]
2511fn builtin_nzchar(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2512    match args.first() {
2513        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
2514            let Vector::Character(vals) = &rv.inner else {
2515                unreachable!()
2516            };
2517            let result: Vec<Option<bool>> = vals
2518                .iter()
2519                .map(|s| match s {
2520                    Some(s) => Some(!s.is_empty()),
2521                    None => Some(true),
2522                })
2523                .collect();
2524            Ok(RValue::vec(Vector::Logical(result.into())))
2525        }
2526        Some(val) => {
2527            let s = val
2528                .as_vector()
2529                .and_then(|v| v.as_character_scalar())
2530                .unwrap_or_default();
2531            Ok(RValue::vec(Vector::Logical(
2532                vec![Some(!s.is_empty())].into(),
2533            )))
2534        }
2535        None => Err(RError::new(
2536            RErrorKind::Argument,
2537            "argument is missing".to_string(),
2538        )),
2539    }
2540}
2541
2542/// Wrap strings in single (typographic) quotes.
2543///
2544/// @param x character vector to quote
2545/// @return character vector with elements wrapped in single curly quotes
2546#[builtin(name = "sQuote", min_args = 1)]
2547fn builtin_squote(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2548    match args.first() {
2549        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
2550            let Vector::Character(vals) = &rv.inner else {
2551                unreachable!()
2552            };
2553            let result: Vec<Option<String>> = vals
2554                .iter()
2555                .map(|s| s.as_ref().map(|s| format!("\u{2018}{}\u{2019}", s)))
2556                .collect();
2557            Ok(RValue::vec(Vector::Character(result.into())))
2558        }
2559        Some(val) => {
2560            let s = val
2561                .as_vector()
2562                .and_then(|v| v.as_character_scalar())
2563                .unwrap_or_default();
2564            Ok(RValue::vec(Vector::Character(
2565                vec![Some(format!("\u{2018}{}\u{2019}", s))].into(),
2566            )))
2567        }
2568        None => Ok(RValue::vec(Vector::Character(vec![None].into()))),
2569    }
2570}
2571
2572/// Wrap strings in double (typographic) quotes.
2573///
2574/// @param x character vector to quote
2575/// @return character vector with elements wrapped in double curly quotes
2576#[builtin(name = "dQuote", min_args = 1)]
2577fn builtin_dquote(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2578    match args.first() {
2579        Some(RValue::Vector(rv)) if matches!(rv.inner, Vector::Character(_)) => {
2580            let Vector::Character(vals) = &rv.inner else {
2581                unreachable!()
2582            };
2583            let result: Vec<Option<String>> = vals
2584                .iter()
2585                .map(|s| s.as_ref().map(|s| format!("\u{201C}{}\u{201D}", s)))
2586                .collect();
2587            Ok(RValue::vec(Vector::Character(result.into())))
2588        }
2589        Some(val) => {
2590            let s = val
2591                .as_vector()
2592                .and_then(|v| v.as_character_scalar())
2593                .unwrap_or_default();
2594            Ok(RValue::vec(Vector::Character(
2595                vec![Some(format!("\u{201C}{}\u{201D}", s))].into(),
2596            )))
2597        }
2598        _ => Ok(RValue::vec(Vector::Character(vec![None].into()))),
2599    }
2600}
2601
2602// region: strrep
2603
2604/// Repeat each element of a character vector a specified number of times.
2605///
2606/// @param x a character vector
2607/// @param times an integer vector of repetition counts (recycled)
2608/// @return a character vector with each element repeated
2609#[builtin(min_args = 2)]
2610fn builtin_strrep(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2611    let x = args.first().and_then(|v| v.as_vector()).ok_or_else(|| {
2612        RError::new(
2613            RErrorKind::Argument,
2614            "strrep() requires a character vector as first argument".to_string(),
2615        )
2616    })?;
2617    let times = args.get(1).and_then(|v| v.as_vector()).ok_or_else(|| {
2618        RError::new(
2619            RErrorKind::Argument,
2620            "strrep() requires an integer 'times' argument".to_string(),
2621        )
2622    })?;
2623
2624    let chars = x.to_characters();
2625    let ints = times.to_integers();
2626    let max_len = chars.len().max(ints.len());
2627
2628    let result: Vec<Option<String>> = (0..max_len)
2629        .map(|i| {
2630            let s = &chars[i % chars.len()];
2631            let n = ints[i % ints.len()];
2632            match (s, n) {
2633                (Some(s), Some(n)) => {
2634                    if n < 0 {
2635                        None // invalid repetition count
2636                    } else {
2637                        Some(s.repeat(usize::try_from(n).unwrap_or(0)))
2638                    }
2639                }
2640                _ => None,
2641            }
2642        })
2643        .collect();
2644
2645    Ok(RValue::vec(Vector::Character(result.into())))
2646}
2647
2648// endregion
2649
2650// region: formatC
2651
2652/// Formatted printing of numbers and strings, similar to C's printf family.
2653///
2654/// @param x numeric or character vector to format
2655/// @param width integer: minimum field width (default 0, meaning no padding)
2656/// @param format character: one of "d" (integer), "f" (fixed), "e" (scientific),
2657///   "g" (general), "s" (string). Default: "g" for numeric, "s" for character.
2658/// @param flag character: formatting flags — "-" left-justify, "+" always show sign,
2659///   " " leading space for positive numbers, "0" zero-pad. Default: ""
2660/// @param digits integer: number of significant or decimal digits (depends on format).
2661///   Default: 6.
2662/// @return character vector of formatted values
2663#[builtin(name = "formatC", min_args = 1)]
2664fn builtin_format_c(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2665    let x = args.first().and_then(|v| v.as_vector()).ok_or_else(|| {
2666        RError::new(
2667            RErrorKind::Argument,
2668            "formatC() requires a vector as first argument".to_string(),
2669        )
2670    })?;
2671
2672    // Extract named arguments with defaults
2673    let width: usize = named
2674        .iter()
2675        .find(|(k, _)| k == "width")
2676        .and_then(|(_, v)| v.as_vector()?.as_integer_scalar())
2677        .and_then(|i| usize::try_from(i).ok())
2678        .or_else(|| {
2679            args.get(1)
2680                .and_then(|v| v.as_vector()?.as_integer_scalar())
2681                .and_then(|i| usize::try_from(i).ok())
2682        })
2683        .unwrap_or(0);
2684
2685    let default_format = match x {
2686        Vector::Character(_) => "s",
2687        _ => "g",
2688    };
2689    let format = named
2690        .iter()
2691        .find(|(k, _)| k == "format")
2692        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
2693        .unwrap_or_else(|| default_format.to_string());
2694
2695    let flag = named
2696        .iter()
2697        .find(|(k, _)| k == "flag")
2698        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
2699        .unwrap_or_default();
2700
2701    let digits: usize = named
2702        .iter()
2703        .find(|(k, _)| k == "digits")
2704        .and_then(|(_, v)| v.as_vector()?.as_integer_scalar())
2705        .and_then(|i| usize::try_from(i).ok())
2706        .unwrap_or(6);
2707
2708    let spec = FmtSpec {
2709        flags: flag,
2710        width: if width > 0 { Some(width) } else { None },
2711        precision: Some(digits),
2712        specifier: format.chars().next().unwrap_or('g'),
2713    };
2714
2715    let result: Vec<Option<String>> = match &format[..] {
2716        "d" => {
2717            let ints = x.to_integers();
2718            ints.iter().map(|v| v.map(|i| spec.format_int(i))).collect()
2719        }
2720        "f" | "e" | "E" | "g" | "G" => {
2721            let doubles = x.to_doubles();
2722            doubles
2723                .iter()
2724                .map(|v| v.map(|f| spec.format_float(f)))
2725                .collect()
2726        }
2727        "s" => {
2728            let chars = x.to_characters();
2729            chars
2730                .iter()
2731                .map(|v| v.as_ref().map(|s| spec.format_str(s)))
2732                .collect()
2733        }
2734        _ => {
2735            return Err(RError::new(
2736                RErrorKind::Argument,
2737                format!(
2738                    "formatC(): invalid 'format' argument '{}'. \
2739                     Use one of: \"d\", \"f\", \"e\", \"g\", \"s\"",
2740                    format
2741                ),
2742            ));
2743        }
2744    };
2745
2746    Ok(RValue::vec(Vector::Character(result.into())))
2747}
2748
2749// endregion
2750
2751// region: format.pval
2752
2753/// Format p-values for display, showing e.g. "< 2.2e-16" for very small values.
2754///
2755/// @param pv numeric vector of p-values
2756/// @param digits integer: number of significant digits (default 3)
2757/// @param eps numeric: threshold below which to show "< eps" (default 2.220446e-16,
2758///   i.e. `.Machine$double.eps`)
2759/// @return character vector of formatted p-values
2760#[builtin(name = "format.pval", min_args = 1)]
2761fn builtin_format_pval(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2762    let x = args.first().and_then(|v| v.as_vector()).ok_or_else(|| {
2763        RError::new(
2764            RErrorKind::Argument,
2765            "format.pval() requires a numeric vector".to_string(),
2766        )
2767    })?;
2768
2769    let digits: usize = named
2770        .iter()
2771        .find(|(k, _)| k == "digits")
2772        .and_then(|(_, v)| v.as_vector()?.as_integer_scalar())
2773        .and_then(|i| usize::try_from(i).ok())
2774        .unwrap_or(3);
2775
2776    let eps: f64 = named
2777        .iter()
2778        .find(|(k, _)| k == "eps")
2779        .and_then(|(_, v)| v.as_vector()?.as_double_scalar())
2780        .unwrap_or(f64::EPSILON);
2781
2782    let doubles = x.to_doubles();
2783    let result: Vec<Option<String>> = doubles
2784        .iter()
2785        .map(|v| {
2786            v.map(|pv| {
2787                if pv.is_nan() {
2788                    "NaN".to_string()
2789                } else if pv < eps {
2790                    format!("< {:.e_digits$e}", eps, e_digits = digits.saturating_sub(1))
2791                } else if pv > 1.0 - eps {
2792                    // Near 1.0 — just show the formatted value
2793                    format!("{:.prec$}", pv, prec = digits)
2794                } else {
2795                    format_g(pv, digits, false)
2796                }
2797            })
2798        })
2799        .collect();
2800
2801    Ok(RValue::vec(Vector::Character(result.into())))
2802}
2803
2804// endregion
2805
2806// region: prettyNum
2807
2808/// Format numbers with separators for readability (e.g., thousand separators).
2809///
2810/// @param x character vector (or coerced from numeric) to format
2811/// @param big.mark character: separator inserted every 3 digits before the decimal
2812///   point (default "")
2813/// @param small.mark character: separator inserted every 3 digits after the decimal
2814///   point (default "")
2815/// @return character vector with separators inserted
2816#[builtin(name = "prettyNum", min_args = 1)]
2817fn builtin_pretty_num(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2818    let x = args.first().and_then(|v| v.as_vector()).ok_or_else(|| {
2819        RError::new(
2820            RErrorKind::Argument,
2821            "prettyNum() requires a vector as first argument".to_string(),
2822        )
2823    })?;
2824
2825    let big_mark = named
2826        .iter()
2827        .find(|(k, _)| k == "big.mark")
2828        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
2829        .unwrap_or_default();
2830
2831    let small_mark = named
2832        .iter()
2833        .find(|(k, _)| k == "small.mark")
2834        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
2835        .unwrap_or_default();
2836
2837    let chars = x.to_characters();
2838    let result: Vec<Option<String>> = chars
2839        .iter()
2840        .map(|v| v.as_ref().map(|s| insert_marks(s, &big_mark, &small_mark)))
2841        .collect();
2842
2843    Ok(RValue::vec(Vector::Character(result.into())))
2844}
2845
/// Insert `big_mark` every 3 digits before the decimal point and `small_mark`
/// every 3 digits after the decimal point.
///
/// Only runs of ASCII digits are grouped: the first digit run of the integer
/// part and the digit run that directly follows the decimal point. Anything
/// else (a currency prefix, an exponent such as `e+10`, or non-numeric text
/// like `FALSE`) is copied through untouched. A leading sign is kept and
/// surrounding whitespace is dropped.
fn insert_marks(s: &str, big_mark: &str, small_mark: &str) -> String {
    if big_mark.is_empty() && small_mark.is_empty() {
        return s.to_string();
    }

    // Split into sign, integer part, and fractional part.
    let (sign, rest) = if let Some(stripped) = s.strip_prefix('-') {
        ("-", stripped)
    } else if let Some(stripped) = s.strip_prefix('+') {
        ("+", stripped)
    } else {
        ("", s)
    };
    let rest = rest.trim();

    let (int_part, frac_part) = match rest.find('.') {
        Some(dot) => (&rest[..dot], Some(&rest[dot + 1..])),
        None => (rest, None),
    };

    let mut out = String::with_capacity(s.len() + 10);
    out.push_str(sign);

    // Integer part: <non-digit prefix><digits><tail>; only <digits> is grouped,
    // counting positions from the right.
    let digit_start = int_part
        .find(|c: char| c.is_ascii_digit())
        .unwrap_or(int_part.len());
    let (prefix, numeric) = int_part.split_at(digit_start);
    let (digits, tail) = split_leading_digits(numeric);
    out.push_str(prefix);
    if big_mark.is_empty() {
        out.push_str(digits);
    } else {
        // `digits` is pure ASCII, so byte length equals digit count.
        for (i, ch) in digits.chars().enumerate() {
            out.push(ch);
            let remaining = digits.len() - 1 - i;
            if remaining > 0 && remaining % 3 == 0 {
                out.push_str(big_mark);
            }
        }
    }
    out.push_str(tail);

    // Fractional part: <digits><tail>; <digits> is grouped from the left.
    if let Some(frac) = frac_part {
        out.push('.');
        let (digits, tail) = split_leading_digits(frac);
        if small_mark.is_empty() {
            out.push_str(digits);
        } else {
            for (i, ch) in digits.chars().enumerate() {
                out.push(ch);
                let pos = i + 1;
                if pos < digits.len() && pos % 3 == 0 {
                    out.push_str(small_mark);
                }
            }
        }
        out.push_str(tail);
    }

    out
}

/// Split `s` into its leading run of ASCII digits and whatever follows it.
fn split_leading_digits(s: &str) -> (&str, &str) {
    let end = s.find(|c: char| !c.is_ascii_digit()).unwrap_or(s.len());
    s.split_at(end)
}
2909
2910// endregion
2911
2912// region: encoding builtins
2913
2914/// Report the encoding of character strings.
2915///
2916/// Returns "unknown" for pure-ASCII strings, "UTF-8" for non-ASCII.
2917/// miniR is UTF-8 everywhere, so this simply checks for non-ASCII bytes.
2918///
2919/// @param x character vector
2920/// @return character vector of encoding names
2921#[builtin(name = "Encoding", min_args = 1)]
2922fn builtin_encoding(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
2923    let chars = match args.first() {
2924        Some(RValue::Vector(rv)) => rv.to_characters(),
2925        _ => {
2926            return Err(RError::new(
2927                RErrorKind::Argument,
2928                "argument is not a character vector".to_string(),
2929            ))
2930        }
2931    };
2932    let result: Vec<Option<String>> = chars
2933        .iter()
2934        .map(|s| match s {
2935            Some(s) => {
2936                if s.is_ascii() {
2937                    Some("unknown".to_string())
2938                } else {
2939                    Some("UTF-8".to_string())
2940                }
2941            }
2942            None => Some("unknown".to_string()),
2943        })
2944        .collect();
2945    Ok(RValue::vec(Vector::Character(result.into())))
2946}
2947
2948/// Convert character strings between encodings.
2949///
2950/// Supports conversions between UTF-8, Latin-1 (ISO-8859-1), ASCII, and "bytes".
2951/// Uses bstr for byte-level manipulation when dealing with non-UTF-8 data.
2952///
2953/// When `sub` is provided, it replaces characters that cannot be represented
2954/// in the target encoding. The special value `sub = "byte"` uses hex `<xx>`
2955/// escapes (matching R's behavior).
2956///
2957/// @param x character vector to convert
2958/// @param from source encoding name (default: "")
2959/// @param to target encoding name (default: "")
2960/// @param sub substitution string for unconvertible characters (default: NA)
2961/// @return character vector with converted strings
2962#[builtin(min_args = 1)]
2963fn builtin_iconv(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
2964    let chars = match args.first() {
2965        Some(RValue::Vector(rv)) => rv.to_characters(),
2966        _ => {
2967            return Err(RError::new(
2968                RErrorKind::Argument,
2969                "argument is not a character vector".to_string(),
2970            ))
2971        }
2972    };
2973
2974    let call_args = super::CallArgs::new(args, named);
2975    let from_enc = call_args
2976        .optional_string("from", 1)
2977        .unwrap_or_default()
2978        .to_uppercase();
2979    let to_enc = call_args
2980        .optional_string("to", 2)
2981        .unwrap_or_default()
2982        .to_uppercase();
2983    let sub = call_args.optional_string("sub", 3);
2984
2985    // Normalize encoding names
2986    let from_enc = normalize_encoding_name(&from_enc);
2987    let to_enc = normalize_encoding_name(&to_enc);
2988
2989    let result: Vec<Option<String>> = chars
2990        .iter()
2991        .map(|s| {
2992            s.as_ref()
2993                .map(|s| iconv_one(s, &from_enc, &to_enc, sub.as_deref()))
2994        })
2995        .collect();
2996
2997    Ok(RValue::vec(Vector::Character(result.into())))
2998}
2999
/// Map an encoding name (case-insensitive) onto its canonical spelling.
///
/// The empty string and the "native" aliases resolve to "UTF-8"; names that are
/// not recognised are returned upper-cased.
fn normalize_encoding_name(name: &str) -> String {
    let upper = name.to_uppercase();
    let canonical = match upper.as_str() {
        "" | "NATIVE" | "NATIVE.ENC" | "UTF8" | "UTF-8" => "UTF-8",
        "LATIN1" | "LATIN-1" | "ISO-8859-1" | "ISO8859-1" | "ISO88591" => "LATIN-1",
        "ASCII" | "US-ASCII" => "ASCII",
        // "BYTES" and unrecognised names are already in their final form.
        _ => return upper,
    };
    canonical.to_string()
}
3012
/// Convert one string from encoding `from` to encoding `to`.
///
/// Both names are expected in canonical form. The input is a Rust `&str` and
/// therefore already UTF-8, so `from` only matters for the "same encoding"
/// shortcut. For the "ASCII" and "LATIN-1" targets, characters outside the
/// target range are replaced according to `sub`: `"byte"` writes each UTF-8 byte
/// as `<xx>`, any other string is inserted verbatim, and `None` drops the
/// character. "BYTES" renders every byte as `\xNN`. Any other target returns the
/// input unchanged.
fn iconv_one(s: &str, from: &str, to: &str, sub: Option<&str>) -> String {
    /// Copy `s`, substituting every character for which `keep` is false.
    fn replace_unrepresentable(
        s: &str,
        sub: Option<&str>,
        keep: impl Fn(char) -> bool,
    ) -> String {
        let mut out = String::new();
        for c in s.chars() {
            if keep(c) {
                out.push(c);
                continue;
            }
            match sub {
                Some("byte") => {
                    // Hex-escape each byte of the character's UTF-8 encoding.
                    let mut buf = [0u8; 4];
                    for b in c.encode_utf8(&mut buf).bytes() {
                        out.push_str(&format!("<{b:02x}>"));
                    }
                }
                Some(replacement) => out.push_str(replacement),
                // No substitution given: the character is silently dropped.
                None => {}
            }
        }
        out
    }

    if from == to {
        return s.to_string();
    }

    match to {
        "ASCII" => replace_unrepresentable(s, sub, |c| c.is_ascii()),
        "LATIN-1" => replace_unrepresentable(s, sub, |c| u32::from(c) <= 255),
        "BYTES" => s.bytes().map(|b| format!("\\x{b:02x}")).collect(),
        // "UTF-8" needs no work, and unsupported targets are passed through.
        _ => s.to_string(),
    }
}
3081
3082/// Convert character vector to UTF-8 encoding (passthrough in miniR).
3083///
3084/// Since miniR uses UTF-8 everywhere, this is a no-op that returns its input.
3085///
3086/// @param x character vector
3087/// @return character vector (unchanged)
3088#[builtin(min_args = 1)]
3089fn builtin_enc2utf8(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
3090    match args.first() {
3091        Some(v @ RValue::Vector(_)) => Ok(v.clone()),
3092        _ => Err(RError::new(
3093            RErrorKind::Argument,
3094            "argument is not a character vector".to_string(),
3095        )),
3096    }
3097}
3098
3099/// Convert character vector to native encoding (passthrough in miniR).
3100///
3101/// Since miniR uses UTF-8 everywhere, this is a no-op that returns its input.
3102///
3103/// @param x character vector
3104/// @return character vector (unchanged)
3105#[builtin(min_args = 1)]
3106fn builtin_enc2native(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
3107    match args.first() {
3108        Some(v @ RValue::Vector(_)) => Ok(v.clone()),
3109        _ => Err(RError::new(
3110            RErrorKind::Argument,
3111            "argument is not a character vector".to_string(),
3112        )),
3113    }
3114}
3115
3116// endregion
3117
3118// region: strtrim
3119
3120/// Trim character strings to a specified display width.
3121///
3122/// Truncates each string to at most `width` display columns. Multi-byte
3123/// characters and wide characters (CJK) are measured by their terminal width.
3124///
3125/// @param x character vector
3126/// @param width integer vector of maximum widths (recycled)
3127/// @return character vector of trimmed strings
3128#[builtin(min_args = 2)]
3129fn builtin_strtrim(args: &[RValue], _: &[(String, RValue)]) -> Result<RValue, RError> {
3130    let chars = match args.first() {
3131        Some(RValue::Vector(rv)) => rv.to_characters(),
3132        _ => {
3133            return Err(RError::new(
3134                RErrorKind::Argument,
3135                "non-character argument".to_string(),
3136            ))
3137        }
3138    };
3139    let widths = match args.get(1) {
3140        Some(RValue::Vector(rv)) => rv.to_doubles(),
3141        _ => {
3142            return Err(RError::new(
3143                RErrorKind::Argument,
3144                "'width' must be numeric".to_string(),
3145            ))
3146        }
3147    };
3148    if widths.is_empty() {
3149        return Err(RError::new(
3150            RErrorKind::Argument,
3151            "invalid 'width' argument — must be a positive number".to_string(),
3152        ));
3153    }
3154
3155    let result: Vec<Option<String>> = chars
3156        .iter()
3157        .enumerate()
3158        .map(|(i, s)| {
3159            let w = widths[i % widths.len()];
3160            match (s, w) {
3161                (Some(s), Some(w)) => {
3162                    let max_w = w.max(0.0) as usize;
3163                    Some(trim_to_width(s, max_w))
3164                }
3165                (None, _) => None,
3166                (_, None) => None,
3167            }
3168        })
3169        .collect();
3170    Ok(RValue::vec(Vector::Character(result.into())))
3171}
3172
3173/// Trim a string to at most `max_width` display columns.
3174fn trim_to_width(s: &str, max_width: usize) -> String {
3175    use unicode_width::UnicodeWidthChar;
3176    let mut result = String::new();
3177    let mut current_width = 0;
3178    for ch in s.chars() {
3179        let ch_width = UnicodeWidthChar::width(ch).unwrap_or(0);
3180        if current_width + ch_width > max_width {
3181            break;
3182        }
3183        result.push(ch);
3184        current_width += ch_width;
3185    }
3186    result
3187}
3188
3189// endregion
3190
3191// region: URLencode, URLdecode, casefold, encodeString, substr<-
3192
3193/// Percent-encode URL strings per RFC 3986.
3194///
3195/// Vectorized over URL.
3196///
3197/// @param URL character vector of strings to encode
3198/// @param reserved if TRUE (default), also encode reserved characters
3199/// @return character vector of percent-encoded strings
3200#[builtin(name = "URLencode", min_args = 1, namespace = "utils")]
3201fn builtin_urlencode(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
3202    let url_vec = args
3203        .first()
3204        .and_then(|v| v.as_vector())
3205        .map(|v| v.to_characters())
3206        .unwrap_or_default();
3207    let reserved = named
3208        .iter()
3209        .find(|(k, _)| k == "reserved")
3210        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
3211        .unwrap_or(true);
3212
3213    let unreserved = |b: u8| -> bool {
3214        b.is_ascii_alphanumeric() || b == b'-' || b == b'_' || b == b'.' || b == b'~'
3215    };
3216    let is_reserved = |b: u8| -> bool {
3217        matches!(
3218            b,
3219            b':' | b'/'
3220                | b'?'
3221                | b'#'
3222                | b'['
3223                | b']'
3224                | b'@'
3225                | b'!'
3226                | b'$'
3227                | b'&'
3228                | b'\''
3229                | b'('
3230                | b')'
3231                | b'*'
3232                | b'+'
3233                | b','
3234                | b';'
3235                | b'='
3236        )
3237    };
3238
3239    let result: Vec<Option<String>> = url_vec
3240        .into_iter()
3241        .map(|s_opt| {
3242            s_opt.map(|s| {
3243                let mut encoded = String::new();
3244                for &b in s.as_bytes() {
3245                    if unreserved(b) || (!reserved && is_reserved(b)) {
3246                        encoded.push(char::from(b));
3247                    } else {
3248                        encoded.push_str(&format!("%{:02X}", b));
3249                    }
3250                }
3251                encoded
3252            })
3253        })
3254        .collect();
3255    Ok(RValue::vec(Vector::Character(result.into())))
3256}
3257
3258/// Decode percent-encoded URL strings.
3259///
3260/// Vectorized over URL.
3261///
3262/// @param URL character vector of percent-encoded strings
3263/// @return character vector of decoded strings
3264#[builtin(name = "URLdecode", min_args = 1, namespace = "utils")]
3265fn builtin_urldecode(args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
3266    let url_vec = args
3267        .first()
3268        .and_then(|v| v.as_vector())
3269        .map(|v| v.to_characters())
3270        .unwrap_or_default();
3271
3272    let result: Vec<Option<String>> = url_vec
3273        .into_iter()
3274        .map(|s_opt| {
3275            s_opt.map(|s| {
3276                let mut bytes = Vec::new();
3277                let mut chars = s.bytes();
3278                while let Some(b) = chars.next() {
3279                    if b == b'%' {
3280                        let hi = chars.next().unwrap_or(b'0');
3281                        let lo = chars.next().unwrap_or(b'0');
3282                        let hex = [hi, lo];
3283                        if let Ok(val) =
3284                            u8::from_str_radix(std::str::from_utf8(&hex).unwrap_or("00"), 16)
3285                        {
3286                            bytes.push(val);
3287                        }
3288                    } else if b == b'+' {
3289                        bytes.push(b' ');
3290                    } else {
3291                        bytes.push(b);
3292                    }
3293                }
3294                String::from_utf8_lossy(&bytes).into_owned()
3295            })
3296        })
3297        .collect();
3298    Ok(RValue::vec(Vector::Character(result.into())))
3299}
3300
3301/// Convert case of a character vector.
3302///
3303/// @param x character vector
3304/// @param upper if TRUE, convert to uppercase; if FALSE (default), to lowercase
3305/// @return character vector with converted case
3306#[builtin(min_args = 1)]
3307fn builtin_casefold(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
3308    let upper = named
3309        .iter()
3310        .find(|(k, _)| k == "upper")
3311        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
3312        .unwrap_or(false);
3313
3314    match args.first() {
3315        Some(RValue::Vector(rv)) => {
3316            let result: Vec<Option<String>> = rv
3317                .to_characters()
3318                .into_iter()
3319                .map(|opt| {
3320                    opt.map(|s| {
3321                        if upper {
3322                            s.to_uppercase()
3323                        } else {
3324                            s.to_lowercase()
3325                        }
3326                    })
3327                })
3328                .collect();
3329            Ok(RValue::vec(Vector::Character(result.into())))
3330        }
3331        _ => Ok(RValue::Null),
3332    }
3333}
3334
3335/// Encode a character string with optional quoting, width, and justification.
3336///
3337/// @param x character vector
3338/// @param width integer: minimum field width (default NA, meaning no padding;
3339///   0 means pad to the widest element)
3340/// @param quote character scalar: quote character to wrap each string in (default "")
3341/// @param na.encode logical: if TRUE (default), encode NA as "NA"
3342/// @param justify character scalar: "left", "right", "centre"/"center", or "none"
3343///   (default "left"; "none" when width is NA)
3344/// @return character vector with encoded strings
3345#[builtin(name = "encodeString", min_args = 1)]
3346fn builtin_encode_string(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
3347    let width: Option<usize> = named
3348        .iter()
3349        .find(|(k, _)| k == "width")
3350        .and_then(|(_, v)| v.as_vector()?.as_integer_scalar())
3351        .and_then(|i| usize::try_from(i).ok())
3352        .or_else(|| {
3353            args.get(1)
3354                .and_then(|v| v.as_vector()?.as_integer_scalar())
3355                .and_then(|i| usize::try_from(i).ok())
3356        });
3357
3358    let quote = named
3359        .iter()
3360        .find(|(k, _)| k == "quote")
3361        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
3362        .or_else(|| {
3363            args.get(2)
3364                .and_then(|v| v.as_vector()?.as_character_scalar())
3365        })
3366        .unwrap_or_default();
3367
3368    let na_encode = named
3369        .iter()
3370        .find(|(k, _)| k == "na.encode")
3371        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
3372        .unwrap_or(true);
3373
3374    let justify = named
3375        .iter()
3376        .find(|(k, _)| k == "justify")
3377        .and_then(|(_, v)| v.as_vector()?.as_character_scalar())
3378        .or_else(|| {
3379            args.get(3)
3380                .and_then(|v| v.as_vector()?.as_character_scalar())
3381        })
3382        .unwrap_or_else(|| {
3383            if width.is_some() {
3384                "left".to_string()
3385            } else {
3386                "none".to_string()
3387            }
3388        });
3389
3390    match args.first() {
3391        Some(RValue::Vector(rv)) => {
3392            // First pass: escape and quote all strings
3393            let chars = rv.to_characters();
3394            let mut encoded: Vec<Option<String>> = chars
3395                .into_iter()
3396                .map(|opt| match opt {
3397                    Some(s) => {
3398                        let escaped = s
3399                            .replace('\\', "\\\\")
3400                            .replace('\n', "\\n")
3401                            .replace('\r', "\\r")
3402                            .replace('\t', "\\t");
3403                        let escaped = if !quote.is_empty() {
3404                            escaped.replace(&quote, &format!("\\{quote}"))
3405                        } else {
3406                            escaped
3407                        };
3408                        if quote.is_empty() {
3409                            Some(escaped)
3410                        } else {
3411                            Some(format!("{quote}{escaped}{quote}"))
3412                        }
3413                    }
3414                    None => {
3415                        if na_encode {
3416                            Some("NA".to_string())
3417                        } else {
3418                            None
3419                        }
3420                    }
3421                })
3422                .collect();
3423
3424            // Second pass: apply width padding if requested
3425            if let Some(w) = width {
3426                // If width == 0, use the max display width of all encoded strings
3427                let effective_width = if w == 0 {
3428                    encoded
3429                        .iter()
3430                        .filter_map(|s| s.as_ref())
3431                        .map(|s| UnicodeWidthStr::width(s.as_str()))
3432                        .max()
3433                        .unwrap_or(0)
3434                } else {
3435                    w
3436                };
3437
3438                if effective_width > 0 && justify != "none" {
3439                    encoded = encoded
3440                        .into_iter()
3441                        .map(|opt| {
3442                            opt.map(|s| {
3443                                let display_w = UnicodeWidthStr::width(s.as_str());
3444                                if display_w >= effective_width {
3445                                    s
3446                                } else {
3447                                    let pad = effective_width - display_w;
3448                                    match justify.as_str() {
3449                                        "left" => format!("{}{}", s, " ".repeat(pad)),
3450                                        "right" => format!("{}{}", " ".repeat(pad), s),
3451                                        "centre" | "center" => {
3452                                            let left_pad = pad / 2;
3453                                            let right_pad = pad - left_pad;
3454                                            format!(
3455                                                "{}{}{}",
3456                                                " ".repeat(left_pad),
3457                                                s,
3458                                                " ".repeat(right_pad)
3459                                            )
3460                                        }
3461                                        _ => s, // "none" or unknown
3462                                    }
3463                                }
3464                            })
3465                        })
3466                        .collect();
3467                }
3468            }
3469
3470            Ok(RValue::vec(Vector::Character(encoded.into())))
3471        }
3472        _ => Ok(RValue::Null),
3473    }
3474}
3475
3476/// Replace substrings in character strings.
3477///
3478/// Vectorized over x, with start, stop, and value recycled.
3479///
3480/// @param x character vector (modified in place conceptually)
3481/// @param start integer vector of start positions (1-based)
3482/// @param stop integer vector of stop positions (1-based)
3483/// @param value character vector of replacement strings
3484/// @return character vector with substrings replaced
3485#[builtin(name = "substr<-", min_args = 4)]
3486fn builtin_substr_assign(args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
3487    let x_vec = args
3488        .first()
3489        .and_then(|v| v.as_vector())
3490        .map(|v| v.to_characters())
3491        .unwrap_or_default();
3492    let start_vec = args
3493        .get(1)
3494        .and_then(|v| v.as_vector())
3495        .map(|v| v.to_integers())
3496        .unwrap_or_else(|| vec![Some(1)]);
3497    let stop_vec = args
3498        .get(2)
3499        .and_then(|v| v.as_vector())
3500        .map(|v| v.to_integers())
3501        .unwrap_or_else(|| vec![Some(1)]);
3502    let value_vec = args
3503        .get(3)
3504        .and_then(|v| v.as_vector())
3505        .map(|v| v.to_characters())
3506        .unwrap_or_default();
3507
3508    if x_vec.is_empty() {
3509        return Ok(RValue::vec(Vector::Character(vec![].into())));
3510    }
3511
3512    let n = x_vec.len();
3513    let result: Vec<Option<String>> = (0..n)
3514        .map(|i| {
3515            let x_opt = &x_vec[i];
3516            let start_opt = if start_vec.is_empty() {
3517                Some(1)
3518            } else {
3519                start_vec[i % start_vec.len()]
3520            };
3521            let stop_opt = if stop_vec.is_empty() {
3522                Some(1)
3523            } else {
3524                stop_vec[i % stop_vec.len()]
3525            };
3526            let value_opt = if value_vec.is_empty() {
3527                None
3528            } else {
3529                value_vec[i % value_vec.len()].clone()
3530            };
3531
3532            match (x_opt, start_opt, stop_opt, value_opt) {
3533                (Some(x), Some(start_i), Some(stop_i), Some(value)) => {
3534                    let start = usize::try_from(start_i).unwrap_or(0);
3535                    let stop = usize::try_from(stop_i).unwrap_or(0);
3536                    let chars: Vec<char> = x.chars().collect();
3537                    let start = start.saturating_sub(1).min(chars.len());
3538                    let stop = stop.min(chars.len());
3539                    let range_len = stop.saturating_sub(start);
3540                    let repl_chars: Vec<char> = value.chars().take(range_len).collect();
3541
3542                    let mut result: Vec<char> = chars[..start].to_vec();
3543                    result.extend(&repl_chars);
3544                    if repl_chars.len() < range_len {
3545                        result.extend(&chars[start + repl_chars.len()..stop]);
3546                    }
3547                    result.extend(&chars[stop..]);
3548
3549                    Some(result.into_iter().collect())
3550                }
3551                (None, _, _, _) | (_, None, _, _) | (_, _, None, _) | (_, _, _, None) => None,
3552            }
3553        })
3554        .collect();
3555
3556    Ok(RValue::vec(Vector::Character(result.into())))
3557}
3558
3559// endregion
3560
3561// region: strwrap
3562
3563/// Wrap character strings to a specified width.
3564///
3565/// Breaks long strings into lines of at most `width` characters. Supports
3566/// first-line indentation (`indent`) and subsequent-line indentation (`exdent`).
3567///
3568/// @param x character vector to wrap
3569/// @param width target line width (default: 0.9 * getOption("width"))
3570/// @param indent indentation of first line (default: 0)
3571/// @param exdent indentation of subsequent lines (default: 0)
3572/// @param prefix prefix for each line (default: "")
3573/// @param simplify if TRUE return a character vector, if FALSE return a list
3574/// @return character vector (or list) of wrapped strings
3575/// @namespace base
3576#[builtin(min_args = 1)]
3577fn builtin_strwrap(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
3578    let call_args = CallArgs::new(args, named);
3579
3580    let x = match call_args.value("x", 0) {
3581        Some(v) => match v.as_vector() {
3582            Some(v) => v.to_characters(),
3583            None => {
3584                return Err(RError::new(
3585                    RErrorKind::Argument,
3586                    "invalid 'x' argument".to_string(),
3587                ))
3588            }
3589        },
3590        None => {
3591            return Err(RError::new(
3592                RErrorKind::Argument,
3593                "argument 'x' is missing".to_string(),
3594            ))
3595        }
3596    };
3597
3598    let width = call_args.integer_or("width", 1, 80) as usize;
3599    let indent = call_args.integer_or("indent", 2, 0) as usize;
3600    let exdent = call_args.integer_or("exdent", 3, 0) as usize;
3601    let prefix = call_args.optional_string("prefix", 4).unwrap_or_default();
3602    let initial = call_args
3603        .optional_string("initial", 5)
3604        .unwrap_or_else(|| prefix.clone());
3605    let simplify = call_args.logical_flag("simplify", 6, true);
3606
3607    let indent_str = " ".repeat(indent);
3608    let exdent_str = " ".repeat(exdent);
3609
3610    let mut all_lines: Vec<Vec<Option<String>>> = Vec::new();
3611    for s_opt in &x {
3612        match s_opt {
3613            None => all_lines.push(vec![None]),
3614            Some(s) => {
3615                let effective_width = width.saturating_sub(initial.len() + indent);
3616                if effective_width == 0 {
3617                    all_lines.push(vec![Some(format!("{initial}{indent_str}{s}"))]);
3618                    continue;
3619                }
3620                let wrapped = textwrap::wrap(s, effective_width);
3621                let mut lines = Vec::new();
3622                for (i, line) in wrapped.iter().enumerate() {
3623                    if i == 0 {
3624                        lines.push(Some(format!("{initial}{indent_str}{line}")));
3625                    } else {
3626                        let ew = width.saturating_sub(prefix.len() + exdent);
3627                        let rewrapped = if ew > 0 && line.len() > ew {
3628                            textwrap::wrap(line, ew)
3629                        } else {
3630                            vec![std::borrow::Cow::Borrowed(line.as_ref())]
3631                        };
3632                        for subline in rewrapped {
3633                            lines.push(Some(format!("{prefix}{exdent_str}{subline}")));
3634                        }
3635                    }
3636                }
3637                if lines.is_empty() {
3638                    lines.push(Some(format!("{initial}{indent_str}")));
3639                }
3640                all_lines.push(lines);
3641            }
3642        }
3643    }
3644
3645    if simplify {
3646        let flat: Vec<Option<String>> = all_lines.into_iter().flatten().collect();
3647        Ok(RValue::vec(Vector::Character(flat.into())))
3648    } else {
3649        let list_vals: Vec<(Option<String>, RValue)> = all_lines
3650            .into_iter()
3651            .map(|lines| (None, RValue::vec(Vector::Character(lines.into()))))
3652            .collect();
3653        Ok(RValue::List(RList::new(list_vals)))
3654    }
3655}
3656
3657// endregion
3658
3659// region: CRAN-compat string builtins (toString, gettext, type.convert, locale)
3660
3661/// Collapse a vector into a single comma-separated string.
3662///
3663/// @param x vector to collapse
3664/// @param sep separator (default ", ")
3665/// @return character scalar
3666/// @namespace base
3667#[builtin(name = "toString", min_args = 1)]
3668fn builtin_to_string(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
3669    let call_args = CallArgs::new(args, named);
3670    let sep = call_args
3671        .optional_string("sep", 1)
3672        .unwrap_or_else(|| ", ".to_string());
3673
3674    let chars = match args.first() {
3675        Some(RValue::Vector(rv)) => rv.to_characters(),
3676        Some(RValue::Null) => Vec::new(),
3677        Some(RValue::List(l)) => l
3678            .values
3679            .iter()
3680            .map(|(_, v)| v.as_vector().and_then(|vec| vec.as_character_scalar()))
3681            .collect(),
3682        _ => vec![],
3683    };
3684
3685    let parts: Vec<String> = chars.into_iter().flatten().collect();
3686    Ok(RValue::vec(Vector::Character(
3687        vec![Some(parts.join(&sep))].into(),
3688    )))
3689}
3690
3691/// Translate a message (i18n stub — returns the message unchanged).
3692///
3693/// @param ... character strings to return
3694/// @param domain translation domain (ignored)
3695/// @return character vector
3696/// @namespace base
3697#[builtin(min_args = 1)]
3698fn builtin_gettext(args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
3699    // Stub: just return the first argument unchanged
3700    match args.first() {
3701        Some(v) => Ok(v.clone()),
3702        None => Ok(RValue::vec(Vector::Character(
3703            vec![Some(String::new())].into(),
3704        ))),
3705    }
3706}
3707
3708/// Translate a message with singular/plural forms (i18n stub).
3709///
3710/// @param n count for plural selection
3711/// @param msg1 singular message
3712/// @param msg2 plural message
3713/// @return character scalar
3714/// @namespace base
3715#[builtin(min_args = 3)]
3716fn builtin_ngettext(args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
3717    let n = args
3718        .first()
3719        .and_then(|v| v.as_vector()?.as_integer_scalar())
3720        .unwrap_or(1);
3721    let msg = if n == 1 { args.get(1) } else { args.get(2) };
3722    match msg {
3723        Some(v) => Ok(v.clone()),
3724        None => Ok(RValue::vec(Vector::Character(
3725            vec![Some(String::new())].into(),
3726        ))),
3727    }
3728}
3729
3730/// Format and translate a message (i18n stub — delegates to sprintf).
3731///
3732/// @param fmt format string
3733/// @param ... format arguments
3734/// @return character scalar
3735/// @namespace base
3736#[builtin(min_args = 1)]
3737fn builtin_gettextf(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
3738    // Delegate to sprintf
3739    builtin_sprintf(args, named)
3740}
3741
3742/// Auto-convert character vector to appropriate type.
3743///
3744/// @param x character vector to convert
3745/// @param as.is if TRUE, don't convert to factor (default TRUE in miniR)
3746/// @return converted vector (numeric, integer, logical, or character)
3747/// @namespace utils
3748#[builtin(name = "type.convert", min_args = 1)]
3749fn builtin_type_convert(args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
3750    let chars = match args.first() {
3751        Some(RValue::Vector(rv)) => rv.to_characters(),
3752        _ => return Ok(args.first().cloned().unwrap_or(RValue::Null)),
3753    };
3754
3755    // Try logical first
3756    let all_logical = chars
3757        .iter()
3758        .all(|c| matches!(c.as_deref(), Some("TRUE" | "FALSE" | "T" | "F") | None));
3759    if all_logical && !chars.is_empty() {
3760        let vals: Vec<Option<bool>> = chars
3761            .iter()
3762            .map(|c| match c.as_deref() {
3763                Some("TRUE") | Some("T") => Some(true),
3764                Some("FALSE") | Some("F") => Some(false),
3765                _ => None,
3766            })
3767            .collect();
3768        return Ok(RValue::vec(Vector::Logical(vals.into())));
3769    }
3770
3771    // Try integer
3772    let all_int = chars.iter().all(|c| match c {
3773        None => true,
3774        Some(s) => s.parse::<i64>().is_ok(),
3775    });
3776    if all_int && !chars.is_empty() {
3777        let vals: Vec<Option<i64>> = chars
3778            .iter()
3779            .map(|c| match c {
3780                None => None,
3781                Some(s) => s.parse::<i64>().ok(),
3782            })
3783            .collect();
3784        return Ok(RValue::vec(Vector::Integer(vals.into())));
3785    }
3786
3787    // Try double
3788    let all_double = chars.iter().all(|c| match c {
3789        None => true,
3790        Some(s) => s.parse::<f64>().is_ok() || s == "NA" || s == "NaN" || s == "Inf" || s == "-Inf",
3791    });
3792    if all_double && !chars.is_empty() {
3793        let vals: Vec<Option<f64>> = chars
3794            .iter()
3795            .map(|c| match c {
3796                None => None,
3797                Some(s) => match s.as_str() {
3798                    "NA" => None,
3799                    "NaN" => Some(f64::NAN),
3800                    "Inf" => Some(f64::INFINITY),
3801                    "-Inf" => Some(f64::NEG_INFINITY),
3802                    _ => s.parse::<f64>().ok(),
3803                },
3804            })
3805            .collect();
3806        return Ok(RValue::vec(Vector::Double(vals.into())));
3807    }
3808
3809    // Keep as character
3810    Ok(args.first().cloned().unwrap_or(RValue::Null))
3811}
3812
3813/// Get the current locale setting (stub — returns "C" locale).
3814///
3815/// @param category locale category (default "LC_ALL")
3816/// @return character scalar with locale name
3817/// @namespace base
3818#[builtin(name = "Sys.getlocale", min_args = 0)]
3819fn builtin_sys_getlocale(_args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
3820    Ok(RValue::vec(Vector::Character(
3821        vec![Some("C".to_string())].into(),
3822    )))
3823}
3824
3825/// Set the locale (stub — accepts but ignores the setting).
3826///
3827/// @param category locale category
3828/// @param locale locale string
3829/// @return character scalar with previous locale
3830/// @namespace base
3831#[builtin(name = "Sys.setlocale", min_args = 0)]
3832fn builtin_sys_setlocale(_args: &[RValue], _named: &[(String, RValue)]) -> Result<RValue, RError> {
3833    Ok(RValue::vec(Vector::Character(
3834        vec![Some("C".to_string())].into(),
3835    )))
3836}
3837
3838// endregion