Skip to main content

r/interpreter/builtins/
dataframes.rs

1//! Data frame manipulation builtins — merge, subset, transform, with.
2//!
3//! These builtins operate on data frames (lists with class "data.frame").
4//! `subset`, `transform`, and `with` are pre-eval builtins because they need
5//! to evaluate expressions in the context of a data frame's columns.
6//! `split()` is in `interp.rs` since it already existed there.
7
8use crate::interpreter::environment::Environment;
9use crate::interpreter::value::*;
10use crate::interpreter::BuiltinContext;
11use crate::parser::ast::{Arg, Expr};
12use minir_macros::{builtin, pre_eval_builtin};
13
14use super::has_class;
15use super::pre_eval::recycle_value;
16
17// region: helpers
18
19/// Extract a data frame's column names from its "names" attribute.
20pub(super) fn df_col_names(list: &RList) -> Vec<Option<String>> {
21    match list.get_attr("names") {
22        Some(RValue::Vector(rv)) => rv.inner.to_characters(),
23        _ => list.values.iter().map(|(name, _)| name.clone()).collect(),
24    }
25}
26
27/// Get the number of rows in a data frame.
28pub(super) fn df_nrow(list: &RList) -> usize {
29    list.get_attr("row.names")
30        .map(RValue::length)
31        .unwrap_or_else(|| {
32            list.values
33                .iter()
34                .map(|(_, value)| value.length())
35                .max()
36                .unwrap_or(0)
37        })
38}
39
40/// Find a column by name in a data frame, returning its index.
41pub(super) fn df_col_index(list: &RList, name: &str) -> Option<usize> {
42    let names = df_col_names(list);
43    names.iter().position(|n| n.as_deref() == Some(name))
44}
45
46/// Subset a vector by a boolean mask (true = keep, false/NA = drop).
47fn subset_vector_by_mask(vec: &Vector, mask: &[Option<bool>]) -> Vector {
48    let indices: Vec<usize> = mask
49        .iter()
50        .enumerate()
51        .filter(|(_, keep)| **keep == Some(true))
52        .map(|(i, _)| i)
53        .collect();
54    vec.select_indices(&indices)
55}
56
57/// Subset an RValue column by a boolean mask.
58fn subset_rvalue_by_mask(val: &RValue, mask: &[Option<bool>]) -> RValue {
59    match val {
60        RValue::Vector(rv) => {
61            let new_inner = subset_vector_by_mask(&rv.inner, mask);
62            let mut new_rv = RVector::from(new_inner);
63            // Copy non-structural attrs (like class, levels for factors)
64            if let Some(attrs) = &rv.attrs {
65                for (k, v) in attrs.iter() {
66                    if k != "names" && k != "dim" && k != "dimnames" {
67                        new_rv.set_attr(k.clone(), v.clone());
68                    }
69                }
70            }
71            // Subset names if present
72            if let Some(names_attr) = rv.get_attr("names") {
73                if let Some(names_vec) = names_attr.as_vector() {
74                    let new_names = subset_vector_by_mask(names_vec, mask);
75                    new_rv.set_attr("names".to_string(), RValue::vec(new_names));
76                }
77            }
78            RValue::Vector(new_rv)
79        }
80        other => other.clone(),
81    }
82}
83
84/// Build a data frame from columns with automatic row names.
85pub(super) fn build_data_frame(
86    columns: Vec<(Option<String>, RValue)>,
87    nrow: usize,
88) -> Result<RValue, RError> {
89    let names: Vec<Option<String>> = columns.iter().map(|(name, _)| name.clone()).collect();
90    let mut list = RList::new(columns);
91    list.set_attr(
92        "class".to_string(),
93        RValue::vec(Vector::Character(
94            vec![Some("data.frame".to_string())].into(),
95        )),
96    );
97    list.set_attr(
98        "names".to_string(),
99        RValue::vec(Vector::Character(names.into())),
100    );
101    list.set_attr(
102        "row.names".to_string(),
103        RValue::vec(Vector::Integer(
104            (1..=i64::try_from(nrow)?)
105                .map(Some)
106                .collect::<Vec<_>>()
107                .into(),
108        )),
109    );
110    Ok(RValue::List(list))
111}
112
113/// Create a child environment with data frame columns as bindings.
114fn df_to_env(list: &RList, parent: &Environment) -> Environment {
115    let child = Environment::new_child(parent);
116    let names = df_col_names(list);
117    for (i, (_, val)) in list.values.iter().enumerate() {
118        if let Some(Some(name)) = names.get(i) {
119            child.set(name.clone(), val.clone());
120        }
121    }
122    child
123}
124
125/// Coerce an RValue to a character vector of column names.
126fn coerce_to_string_vec(val: &RValue) -> Option<Vec<String>> {
127    match val {
128        RValue::Vector(rv) => {
129            let chars = rv.inner.to_characters();
130            let result: Vec<String> = chars.into_iter().flatten().collect();
131            if result.is_empty() {
132                None
133            } else {
134                Some(result)
135            }
136        }
137        _ => None,
138    }
139}
140
141// endregion
142
143// region: merge
144
145/// Merge two data frames by common columns (SQL-style join).
146///
147/// Performs an inner join by default. Use `all`, `all.x`, or `all.y` for
148/// outer joins. The `by` argument specifies join columns; if omitted, uses
149/// columns with names common to both data frames.
150///
151/// @param x first data frame
152/// @param y second data frame
153/// @param by character vector of shared column names to join on
154/// @param by.x character vector of column names in x to join on
155/// @param by.y character vector of column names in y to join on
156/// @param all logical; if TRUE, do a full outer join
157/// @param all.x logical; if TRUE, keep all rows from x (left join)
158/// @param all.y logical; if TRUE, keep all rows from y (right join)
159/// @return merged data frame
160#[builtin(min_args = 2)]
161fn builtin_merge(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
162    let (x_list, y_list) = match (&args[0], &args[1]) {
163        (x @ RValue::List(xl), y @ RValue::List(yl))
164            if has_class(x, "data.frame") && has_class(y, "data.frame") =>
165        {
166            (xl, yl)
167        }
168        _ => {
169            return Err(RError::new(
170                RErrorKind::Type,
171                "merge() requires two data frames".to_string(),
172            ))
173        }
174    };
175
176    // Parse named arguments
177    let by = named
178        .iter()
179        .find(|(k, _)| k == "by")
180        .map(|(_, v)| v)
181        .or(args.get(2))
182        .and_then(coerce_to_string_vec);
183    let by_x = named
184        .iter()
185        .find(|(k, _)| k == "by.x")
186        .map(|(_, v)| v)
187        .and_then(coerce_to_string_vec);
188    let by_y = named
189        .iter()
190        .find(|(k, _)| k == "by.y")
191        .map(|(_, v)| v)
192        .and_then(coerce_to_string_vec);
193    let all = named
194        .iter()
195        .find(|(k, _)| k == "all")
196        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
197        .unwrap_or(false);
198    let all_x = named
199        .iter()
200        .find(|(k, _)| k == "all.x")
201        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
202        .unwrap_or(all);
203    let all_y = named
204        .iter()
205        .find(|(k, _)| k == "all.y")
206        .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
207        .unwrap_or(all);
208
209    let x_names = df_col_names(x_list);
210    let y_names = df_col_names(y_list);
211
212    // Determine join columns
213    let (join_x, join_y) = match (by, by_x, by_y) {
214        (_, Some(bx), Some(by)) => (bx, by),
215        (Some(b), _, _) => (b.clone(), b),
216        (None, _, _) => {
217            // Auto-detect common columns
218            let x_set: Vec<&str> = x_names.iter().filter_map(|n| n.as_deref()).collect();
219            let common: Vec<String> = y_names
220                .iter()
221                .filter_map(|n| n.as_deref())
222                .filter(|n| x_set.contains(n))
223                .map(|n| n.to_string())
224                .collect();
225            if common.is_empty() {
226                return Err(RError::new(
227                    RErrorKind::Argument,
228                    "no common columns to merge on — specify 'by', 'by.x', or 'by.y'".to_string(),
229                ));
230            }
231            (common.clone(), common)
232        }
233    };
234
235    if join_x.len() != join_y.len() {
236        return Err(RError::new(
237            RErrorKind::Argument,
238            "by.x and by.y must have the same length".to_string(),
239        ));
240    }
241
242    // Validate join columns exist
243    let x_join_indices: Vec<usize> = join_x
244        .iter()
245        .map(|name| {
246            df_col_index(x_list, name).ok_or_else(|| {
247                RError::new(
248                    RErrorKind::Argument,
249                    format!("column '{}' not found in x", name),
250                )
251            })
252        })
253        .collect::<Result<_, _>>()?;
254    let y_join_indices: Vec<usize> = join_y
255        .iter()
256        .map(|name| {
257            df_col_index(y_list, name).ok_or_else(|| {
258                RError::new(
259                    RErrorKind::Argument,
260                    format!("column '{}' not found in y", name),
261                )
262            })
263        })
264        .collect::<Result<_, _>>()?;
265
266    let x_nrow = df_nrow(x_list);
267    let y_nrow = df_nrow(y_list);
268
269    // Extract join key values as character vectors for comparison
270    fn extract_keys(list: &RList, indices: &[usize], nrow: usize) -> Vec<Vec<Option<String>>> {
271        (0..nrow)
272            .map(|row| {
273                indices
274                    .iter()
275                    .map(|&col_idx| {
276                        let (_, val) = &list.values[col_idx];
277                        match val {
278                            RValue::Vector(rv) => {
279                                let chars = rv.inner.to_characters();
280                                chars.get(row).cloned().flatten()
281                            }
282                            _ => None,
283                        }
284                    })
285                    .collect()
286            })
287            .collect()
288    }
289
290    let x_keys = extract_keys(x_list, &x_join_indices, x_nrow);
291    let y_keys = extract_keys(y_list, &y_join_indices, y_nrow);
292
293    // Build match pairs: (x_row, y_row)
294    let mut match_pairs: Vec<(Option<usize>, Option<usize>)> = Vec::new();
295    let mut y_matched = vec![false; y_nrow];
296
297    for (xi, x_key) in x_keys.iter().enumerate() {
298        let mut found = false;
299        for (yi, y_key) in y_keys.iter().enumerate() {
300            if x_key == y_key {
301                match_pairs.push((Some(xi), Some(yi)));
302                y_matched[yi] = true;
303                found = true;
304            }
305        }
306        if !found && all_x {
307            match_pairs.push((Some(xi), None));
308        }
309    }
310    if all_y {
311        for (yi, matched) in y_matched.iter().enumerate() {
312            if !matched {
313                match_pairs.push((None, Some(yi)));
314            }
315        }
316    }
317
318    let result_nrow = match_pairs.len();
319
320    // Determine which columns go into the result
321    let y_join_set: std::collections::HashSet<usize> = y_join_indices.iter().copied().collect();
322
323    // Build output columns: join keys, then x non-key cols, then y non-key cols
324    let mut output_columns: Vec<(Option<String>, RValue)> = Vec::new();
325
326    // Join key columns (from x, with fallback to y for right-join rows)
327    for (ki, &x_col_idx) in x_join_indices.iter().enumerate() {
328        let y_col_idx = y_join_indices[ki];
329        let name = x_names[x_col_idx].clone();
330        let x_col = &x_list.values[x_col_idx].1;
331        let y_col = &y_list.values[y_col_idx].1;
332        let values: Vec<Option<String>> = match_pairs
333            .iter()
334            .map(|(mx, my)| {
335                if let Some(xi) = mx {
336                    match x_col {
337                        RValue::Vector(rv) => rv.inner.to_characters().get(*xi).cloned().flatten(),
338                        _ => None,
339                    }
340                } else if let Some(yi) = my {
341                    match y_col {
342                        RValue::Vector(rv) => rv.inner.to_characters().get(*yi).cloned().flatten(),
343                        _ => None,
344                    }
345                } else {
346                    None
347                }
348            })
349            .collect();
350        output_columns.push((name, RValue::vec(Vector::Character(values.into()))));
351    }
352
353    // x non-key columns
354    let x_join_set: std::collections::HashSet<usize> = x_join_indices.iter().copied().collect();
355    for (col_idx, (name, val)) in x_list.values.iter().enumerate() {
356        if x_join_set.contains(&col_idx) {
357            continue;
358        }
359        let col_name = name.clone().or_else(|| Some(format!("x.{}", col_idx + 1)));
360        let new_val = select_rows_from_column(val, &match_pairs, true);
361        output_columns.push((col_name, new_val));
362    }
363
364    // y non-key columns
365    for (col_idx, (name, val)) in y_list.values.iter().enumerate() {
366        if y_join_set.contains(&col_idx) {
367            continue;
368        }
369        // Suffix duplicate names with ".y"
370        let base_name = name.clone().unwrap_or_else(|| format!("y.{}", col_idx + 1));
371        let col_name = if x_names.iter().any(|n| n.as_deref() == Some(&base_name))
372            && !x_join_set
373                .iter()
374                .any(|&i| x_names[i].as_deref() == Some(&base_name))
375        {
376            Some(format!("{}.y", base_name))
377        } else {
378            Some(base_name)
379        };
380        let new_val = select_rows_from_column(val, &match_pairs, false);
381        output_columns.push((col_name, new_val));
382    }
383
384    build_data_frame(output_columns, result_nrow)
385}
386
387/// Select rows from a column vector based on match pairs.
388fn select_rows_from_column(
389    val: &RValue,
390    pairs: &[(Option<usize>, Option<usize>)],
391    use_first: bool,
392) -> RValue {
393    match val {
394        RValue::Vector(rv) => {
395            let len = rv.inner.len();
396            let indices: Vec<Option<usize>> = pairs
397                .iter()
398                .map(|(mx, my)| if use_first { *mx } else { *my })
399                .collect();
400            match &rv.inner {
401                Vector::Character(vals) => {
402                    let result: Vec<Option<String>> = indices
403                        .iter()
404                        .map(|idx| idx.and_then(|i| if i < len { vals[i].clone() } else { None }))
405                        .collect();
406                    RValue::vec(Vector::Character(result.into()))
407                }
408                Vector::Double(vals) => {
409                    let result: Vec<Option<f64>> = indices
410                        .iter()
411                        .map(|idx| idx.and_then(|i| if i < len { vals.get_opt(i) } else { None }))
412                        .collect();
413                    RValue::vec(Vector::Double(result.into()))
414                }
415                Vector::Integer(vals) => {
416                    let result: Vec<Option<i64>> = indices
417                        .iter()
418                        .map(|idx| idx.and_then(|i| if i < len { vals.get_opt(i) } else { None }))
419                        .collect();
420                    RValue::vec(Vector::Integer(result.into()))
421                }
422                Vector::Logical(vals) => {
423                    let result: Vec<Option<bool>> = indices
424                        .iter()
425                        .map(|idx| idx.and_then(|i| if i < len { vals[i] } else { None }))
426                        .collect();
427                    RValue::vec(Vector::Logical(result.into()))
428                }
429                Vector::Complex(vals) => {
430                    let result: Vec<Option<num_complex::Complex64>> = indices
431                        .iter()
432                        .map(|idx| idx.and_then(|i| if i < len { vals[i] } else { None }))
433                        .collect();
434                    RValue::vec(Vector::Complex(result.into()))
435                }
436                Vector::Raw(vals) => {
437                    let result: Vec<u8> = indices
438                        .iter()
439                        .map(|idx| idx.map_or(0, |i| if i < len { vals[i] } else { 0 }))
440                        .collect();
441                    RValue::vec(Vector::Raw(result))
442                }
443            }
444        }
445        _ => RValue::Null,
446    }
447}
448
449// endregion
450
451// region: subset
452
453/// Subset a data frame by condition and/or column selection.
454///
455/// Evaluates the `subset` expression in the context of the data frame's columns,
456/// keeping rows where it evaluates to TRUE. The `select` argument specifies which
457/// columns to keep.
458///
459/// @param x a data frame
460/// @param subset logical expression evaluated in the data frame context
461/// @param select expression indicating columns to select (e.g., c(a, b) or -c)
462/// @return subsetted data frame
463#[pre_eval_builtin(min_args = 1)]
464fn pre_eval_subset(
465    args: &[Arg],
466    env: &Environment,
467    context: &BuiltinContext,
468) -> Result<RValue, RError> {
469    // First argument is the data frame (evaluate it)
470    let x_arg = args.first().and_then(|a| a.value.as_ref()).ok_or_else(|| {
471        RError::new(
472            RErrorKind::Argument,
473            "subset() requires a data frame argument".to_string(),
474        )
475    })?;
476
477    let x_val = context
478        .with_interpreter(|interp| interp.eval_in(x_arg, env))
479        .map_err(RError::from)?;
480
481    // Handle vector subset (non-data-frame case)
482    if let RValue::Vector(ref rv) = x_val {
483        if !has_class(&x_val, "data.frame") {
484            return subset_vector_impl(rv, args, env, context);
485        }
486    }
487
488    let RValue::List(ref x_list) = x_val else {
489        return Err(RError::new(
490            RErrorKind::Type,
491            "subset() requires a data frame or vector".to_string(),
492        ));
493    };
494
495    if !has_class(&x_val, "data.frame") {
496        return Err(RError::new(
497            RErrorKind::Type,
498            "subset() requires a data frame".to_string(),
499        ));
500    }
501
502    let nrow = df_nrow(x_list);
503    let col_names = df_col_names(x_list);
504
505    // Find subset and select arguments (by name or position)
506    let subset_expr = args
507        .iter()
508        .find(|a| a.name.as_deref() == Some("subset"))
509        .or_else(|| args.get(1).filter(|a| a.name.is_none()))
510        .and_then(|a| a.value.as_ref());
511
512    let select_expr = args
513        .iter()
514        .find(|a| a.name.as_deref() == Some("select"))
515        .or_else(|| args.get(2).filter(|a| a.name.is_none()))
516        .and_then(|a| a.value.as_ref());
517
518    // Evaluate subset condition in data frame context
519    let row_mask = if let Some(expr) = subset_expr {
520        let df_env = df_to_env(x_list, env);
521        let mask_val = context
522            .with_interpreter(|interp| interp.eval_in(expr, &df_env))
523            .map_err(RError::from)?;
524        match mask_val {
525            RValue::Vector(rv) => {
526                let logicals = rv.inner.to_logicals();
527                if logicals.len() != nrow {
528                    return Err(RError::new(
529                        RErrorKind::Argument,
530                        format!(
531                            "subset condition has length {} but data frame has {} rows",
532                            logicals.len(),
533                            nrow
534                        ),
535                    ));
536                }
537                logicals
538            }
539            _ => {
540                return Err(RError::new(
541                    RErrorKind::Type,
542                    "subset condition must evaluate to a logical vector".to_string(),
543                ))
544            }
545        }
546    } else {
547        vec![Some(true); nrow]
548    };
549
550    // Determine which columns to keep
551    let selected_cols = if let Some(expr) = select_expr {
552        resolve_column_selection(expr, &col_names)?
553    } else {
554        (0..col_names.len()).collect()
555    };
556
557    // Build result data frame
558    let result_nrow = row_mask.iter().filter(|m| **m == Some(true)).count();
559    let mut output_columns = Vec::new();
560    for &col_idx in &selected_cols {
561        let (name, val) = &x_list.values[col_idx];
562        let new_val = subset_rvalue_by_mask(val, &row_mask);
563        output_columns.push((name.clone(), new_val));
564    }
565
566    build_data_frame(output_columns, result_nrow)
567}
568
569/// Subset a plain vector by a logical condition.
570fn subset_vector_impl(
571    rv: &RVector,
572    args: &[Arg],
573    env: &Environment,
574    context: &BuiltinContext,
575) -> Result<RValue, RError> {
576    let subset_expr = args
577        .iter()
578        .find(|a| a.name.as_deref() == Some("subset"))
579        .or_else(|| args.get(1).filter(|a| a.name.is_none()))
580        .and_then(|a| a.value.as_ref());
581
582    if let Some(expr) = subset_expr {
583        let mask_val = context
584            .with_interpreter(|interp| interp.eval_in(expr, env))
585            .map_err(RError::from)?;
586        match mask_val {
587            RValue::Vector(mask_rv) => {
588                let logicals = mask_rv.inner.to_logicals();
589                let result = subset_vector_by_mask(&rv.inner, &logicals);
590                Ok(RValue::vec(result))
591            }
592            _ => Err(RError::new(
593                RErrorKind::Type,
594                "subset condition must evaluate to a logical vector".to_string(),
595            )),
596        }
597    } else {
598        Ok(RValue::Vector(rv.clone()))
599    }
600}
601
602/// Resolve a column selection expression to column indices.
603///
604/// Supports: `c(a, b)`, `-c(a, b)`, bare symbol, negative bare symbol.
605fn resolve_column_selection(
606    expr: &Expr,
607    col_names: &[Option<String>],
608) -> Result<Vec<usize>, RError> {
609    match expr {
610        // Negative selection: -c(col1, col2) or -col
611        Expr::UnaryOp {
612            op: crate::parser::ast::UnaryOp::Neg,
613            operand,
614        } => {
615            let exclude = resolve_column_selection(operand, col_names)?;
616            let exclude_set: std::collections::HashSet<usize> = exclude.into_iter().collect();
617            Ok((0..col_names.len())
618                .filter(|i| !exclude_set.contains(i))
619                .collect())
620        }
621        // c(col1, col2)
622        Expr::Call { func, args, .. } => {
623            if let Expr::Symbol(name) = func.as_ref() {
624                if name == "c" {
625                    let mut indices = Vec::new();
626                    for arg in args {
627                        if let Some(Expr::Symbol(col_name)) = arg.value.as_ref() {
628                            if let Some(idx) = col_names
629                                .iter()
630                                .position(|n| n.as_deref() == Some(col_name.as_str()))
631                            {
632                                indices.push(idx);
633                            } else {
634                                return Err(RError::new(
635                                    RErrorKind::Argument,
636                                    format!("column '{}' not found in data frame", col_name),
637                                ));
638                            }
639                        }
640                    }
641                    return Ok(indices);
642                }
643            }
644            Err(RError::new(
645                RErrorKind::Argument,
646                "select argument must be a column name or c(...) of column names".to_string(),
647            ))
648        }
649        // Bare symbol
650        Expr::Symbol(name) => {
651            if let Some(idx) = col_names
652                .iter()
653                .position(|n| n.as_deref() == Some(name.as_str()))
654            {
655                Ok(vec![idx])
656            } else {
657                Err(RError::new(
658                    RErrorKind::Argument,
659                    format!("column '{}' not found in data frame", name),
660                ))
661            }
662        }
663        _ => Err(RError::new(
664            RErrorKind::Argument,
665            "select argument must be a column name, c(...), or negation of these".to_string(),
666        )),
667    }
668}
669
670// endregion
671
672// region: transform
673
674/// Add or modify columns in a data frame.
675///
676/// Evaluates each named `...` argument as an expression in the data frame's
677/// column context, adding or replacing columns with the result.
678///
679/// @param _data a data frame
680/// @param ... named expressions to evaluate (column_name = expression)
681/// @return data frame with modified/added columns
682#[pre_eval_builtin(name = "transform", min_args = 1)]
683fn pre_eval_transform(
684    args: &[Arg],
685    env: &Environment,
686    context: &BuiltinContext,
687) -> Result<RValue, RError> {
688    let data_arg = args.first().and_then(|a| a.value.as_ref()).ok_or_else(|| {
689        RError::new(
690            RErrorKind::Argument,
691            "transform() requires a data frame argument".to_string(),
692        )
693    })?;
694
695    let data_val = context
696        .with_interpreter(|interp| interp.eval_in(data_arg, env))
697        .map_err(RError::from)?;
698
699    let RValue::List(mut data_list) = data_val.clone() else {
700        return Err(RError::new(
701            RErrorKind::Type,
702            "transform() requires a data frame".to_string(),
703        ));
704    };
705
706    if !has_class(&data_val, "data.frame") {
707        return Err(RError::new(
708            RErrorKind::Type,
709            "transform() requires a data frame".to_string(),
710        ));
711    }
712
713    let nrow = df_nrow(&data_list);
714    let col_names = df_col_names(&data_list);
715
716    // Process each named argument after the first
717    for arg in args.iter().skip(1) {
718        let Some(ref col_name) = arg.name else {
719            continue; // Skip unnamed args
720        };
721        let Some(ref expr) = arg.value else {
722            continue;
723        };
724
725        // Evaluate expression in data frame context
726        let df_env = df_to_env(&data_list, env);
727        let new_val = context
728            .with_interpreter(|interp| interp.eval_in(expr, &df_env))
729            .map_err(RError::from)?;
730
731        // Validate length
732        let val_len = new_val.length();
733        if val_len != nrow && val_len != 1 && nrow > 0 {
734            if !nrow.is_multiple_of(val_len) {
735                return Err(RError::new(
736                    RErrorKind::Argument,
737                    format!(
738                        "replacement has {} rows, data has {} — must be a divisor",
739                        val_len, nrow
740                    ),
741                ));
742            }
743            // Recycle
744            let recycled = recycle_value(&new_val, nrow)?;
745            if let Some(idx) = col_names
746                .iter()
747                .position(|n| n.as_deref() == Some(col_name.as_str()))
748            {
749                data_list.values[idx] = (Some(col_name.clone()), recycled);
750            } else {
751                data_list.values.push((Some(col_name.clone()), recycled));
752            }
753            continue;
754        }
755
756        // Add or replace column
757        if let Some(idx) = col_names
758            .iter()
759            .position(|n| n.as_deref() == Some(col_name.as_str()))
760        {
761            data_list.values[idx] = (Some(col_name.clone()), new_val);
762        } else {
763            data_list.values.push((Some(col_name.clone()), new_val));
764        }
765    }
766
767    // Rebuild data frame with updated names
768    let names: Vec<Option<String>> = data_list
769        .values
770        .iter()
771        .map(|(name, _)| name.clone())
772        .collect();
773    data_list.set_attr(
774        "names".to_string(),
775        RValue::vec(Vector::Character(names.into())),
776    );
777
778    Ok(RValue::List(data_list))
779}
780
781// endregion
782
783// region: with
784
785/// Evaluate an expression in the context of a data frame.
786///
787/// Creates an environment where the data frame's columns are accessible
788/// as variables, then evaluates `expr` in that environment.
789///
790/// @param data a data frame (or list)
791/// @param expr expression to evaluate
792/// @return the result of evaluating expr
793#[pre_eval_builtin(name = "with", min_args = 2)]
794fn pre_eval_with(
795    args: &[Arg],
796    env: &Environment,
797    context: &BuiltinContext,
798) -> Result<RValue, RError> {
799    let data_arg = args.first().and_then(|a| a.value.as_ref()).ok_or_else(|| {
800        RError::new(
801            RErrorKind::Argument,
802            "with() requires a data argument".to_string(),
803        )
804    })?;
805
806    let data_val = context
807        .with_interpreter(|interp| interp.eval_in(data_arg, env))
808        .map_err(RError::from)?;
809
810    let data_list = match &data_val {
811        RValue::List(l) => l,
812        _ => {
813            return Err(RError::new(
814                RErrorKind::Type,
815                "with() requires a list or data frame as first argument".to_string(),
816            ))
817        }
818    };
819
820    let expr = args.get(1).and_then(|a| a.value.as_ref()).ok_or_else(|| {
821        RError::new(
822            RErrorKind::Argument,
823            "with() requires an expression argument".to_string(),
824        )
825    })?;
826
827    let df_env = df_to_env(data_list, env);
828    context
829        .with_interpreter(|interp| interp.eval_in(expr, &df_env))
830        .map_err(RError::from)
831}
832
833// endregion
834
835// Note: split() is implemented in interp.rs as interp_split