1use crate::interpreter::environment::Environment;
9use crate::interpreter::value::*;
10use crate::interpreter::BuiltinContext;
11use crate::parser::ast::{Arg, Expr};
12use minir_macros::{builtin, pre_eval_builtin};
13
14use super::has_class;
15use super::pre_eval::recycle_value;
16
17pub(super) fn df_col_names(list: &RList) -> Vec<Option<String>> {
21 match list.get_attr("names") {
22 Some(RValue::Vector(rv)) => rv.inner.to_characters(),
23 _ => list.values.iter().map(|(name, _)| name.clone()).collect(),
24 }
25}
26
27pub(super) fn df_nrow(list: &RList) -> usize {
29 list.get_attr("row.names")
30 .map(RValue::length)
31 .unwrap_or_else(|| {
32 list.values
33 .iter()
34 .map(|(_, value)| value.length())
35 .max()
36 .unwrap_or(0)
37 })
38}
39
40pub(super) fn df_col_index(list: &RList, name: &str) -> Option<usize> {
42 let names = df_col_names(list);
43 names.iter().position(|n| n.as_deref() == Some(name))
44}
45
46fn subset_vector_by_mask(vec: &Vector, mask: &[Option<bool>]) -> Vector {
48 let indices: Vec<usize> = mask
49 .iter()
50 .enumerate()
51 .filter(|(_, keep)| **keep == Some(true))
52 .map(|(i, _)| i)
53 .collect();
54 vec.select_indices(&indices)
55}
56
57fn subset_rvalue_by_mask(val: &RValue, mask: &[Option<bool>]) -> RValue {
59 match val {
60 RValue::Vector(rv) => {
61 let new_inner = subset_vector_by_mask(&rv.inner, mask);
62 let mut new_rv = RVector::from(new_inner);
63 if let Some(attrs) = &rv.attrs {
65 for (k, v) in attrs.iter() {
66 if k != "names" && k != "dim" && k != "dimnames" {
67 new_rv.set_attr(k.clone(), v.clone());
68 }
69 }
70 }
71 if let Some(names_attr) = rv.get_attr("names") {
73 if let Some(names_vec) = names_attr.as_vector() {
74 let new_names = subset_vector_by_mask(names_vec, mask);
75 new_rv.set_attr("names".to_string(), RValue::vec(new_names));
76 }
77 }
78 RValue::Vector(new_rv)
79 }
80 other => other.clone(),
81 }
82}
83
84pub(super) fn build_data_frame(
86 columns: Vec<(Option<String>, RValue)>,
87 nrow: usize,
88) -> Result<RValue, RError> {
89 let names: Vec<Option<String>> = columns.iter().map(|(name, _)| name.clone()).collect();
90 let mut list = RList::new(columns);
91 list.set_attr(
92 "class".to_string(),
93 RValue::vec(Vector::Character(
94 vec![Some("data.frame".to_string())].into(),
95 )),
96 );
97 list.set_attr(
98 "names".to_string(),
99 RValue::vec(Vector::Character(names.into())),
100 );
101 list.set_attr(
102 "row.names".to_string(),
103 RValue::vec(Vector::Integer(
104 (1..=i64::try_from(nrow)?)
105 .map(Some)
106 .collect::<Vec<_>>()
107 .into(),
108 )),
109 );
110 Ok(RValue::List(list))
111}
112
113fn df_to_env(list: &RList, parent: &Environment) -> Environment {
115 let child = Environment::new_child(parent);
116 let names = df_col_names(list);
117 for (i, (_, val)) in list.values.iter().enumerate() {
118 if let Some(Some(name)) = names.get(i) {
119 child.set(name.clone(), val.clone());
120 }
121 }
122 child
123}
124
125fn coerce_to_string_vec(val: &RValue) -> Option<Vec<String>> {
127 match val {
128 RValue::Vector(rv) => {
129 let chars = rv.inner.to_characters();
130 let result: Vec<String> = chars.into_iter().flatten().collect();
131 if result.is_empty() {
132 None
133 } else {
134 Some(result)
135 }
136 }
137 _ => None,
138 }
139}
140
141#[builtin(min_args = 2)]
161fn builtin_merge(args: &[RValue], named: &[(String, RValue)]) -> Result<RValue, RError> {
162 let (x_list, y_list) = match (&args[0], &args[1]) {
163 (x @ RValue::List(xl), y @ RValue::List(yl))
164 if has_class(x, "data.frame") && has_class(y, "data.frame") =>
165 {
166 (xl, yl)
167 }
168 _ => {
169 return Err(RError::new(
170 RErrorKind::Type,
171 "merge() requires two data frames".to_string(),
172 ))
173 }
174 };
175
176 let by = named
178 .iter()
179 .find(|(k, _)| k == "by")
180 .map(|(_, v)| v)
181 .or(args.get(2))
182 .and_then(coerce_to_string_vec);
183 let by_x = named
184 .iter()
185 .find(|(k, _)| k == "by.x")
186 .map(|(_, v)| v)
187 .and_then(coerce_to_string_vec);
188 let by_y = named
189 .iter()
190 .find(|(k, _)| k == "by.y")
191 .map(|(_, v)| v)
192 .and_then(coerce_to_string_vec);
193 let all = named
194 .iter()
195 .find(|(k, _)| k == "all")
196 .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
197 .unwrap_or(false);
198 let all_x = named
199 .iter()
200 .find(|(k, _)| k == "all.x")
201 .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
202 .unwrap_or(all);
203 let all_y = named
204 .iter()
205 .find(|(k, _)| k == "all.y")
206 .and_then(|(_, v)| v.as_vector()?.as_logical_scalar())
207 .unwrap_or(all);
208
209 let x_names = df_col_names(x_list);
210 let y_names = df_col_names(y_list);
211
212 let (join_x, join_y) = match (by, by_x, by_y) {
214 (_, Some(bx), Some(by)) => (bx, by),
215 (Some(b), _, _) => (b.clone(), b),
216 (None, _, _) => {
217 let x_set: Vec<&str> = x_names.iter().filter_map(|n| n.as_deref()).collect();
219 let common: Vec<String> = y_names
220 .iter()
221 .filter_map(|n| n.as_deref())
222 .filter(|n| x_set.contains(n))
223 .map(|n| n.to_string())
224 .collect();
225 if common.is_empty() {
226 return Err(RError::new(
227 RErrorKind::Argument,
228 "no common columns to merge on — specify 'by', 'by.x', or 'by.y'".to_string(),
229 ));
230 }
231 (common.clone(), common)
232 }
233 };
234
235 if join_x.len() != join_y.len() {
236 return Err(RError::new(
237 RErrorKind::Argument,
238 "by.x and by.y must have the same length".to_string(),
239 ));
240 }
241
242 let x_join_indices: Vec<usize> = join_x
244 .iter()
245 .map(|name| {
246 df_col_index(x_list, name).ok_or_else(|| {
247 RError::new(
248 RErrorKind::Argument,
249 format!("column '{}' not found in x", name),
250 )
251 })
252 })
253 .collect::<Result<_, _>>()?;
254 let y_join_indices: Vec<usize> = join_y
255 .iter()
256 .map(|name| {
257 df_col_index(y_list, name).ok_or_else(|| {
258 RError::new(
259 RErrorKind::Argument,
260 format!("column '{}' not found in y", name),
261 )
262 })
263 })
264 .collect::<Result<_, _>>()?;
265
266 let x_nrow = df_nrow(x_list);
267 let y_nrow = df_nrow(y_list);
268
269 fn extract_keys(list: &RList, indices: &[usize], nrow: usize) -> Vec<Vec<Option<String>>> {
271 (0..nrow)
272 .map(|row| {
273 indices
274 .iter()
275 .map(|&col_idx| {
276 let (_, val) = &list.values[col_idx];
277 match val {
278 RValue::Vector(rv) => {
279 let chars = rv.inner.to_characters();
280 chars.get(row).cloned().flatten()
281 }
282 _ => None,
283 }
284 })
285 .collect()
286 })
287 .collect()
288 }
289
290 let x_keys = extract_keys(x_list, &x_join_indices, x_nrow);
291 let y_keys = extract_keys(y_list, &y_join_indices, y_nrow);
292
293 let mut match_pairs: Vec<(Option<usize>, Option<usize>)> = Vec::new();
295 let mut y_matched = vec![false; y_nrow];
296
297 for (xi, x_key) in x_keys.iter().enumerate() {
298 let mut found = false;
299 for (yi, y_key) in y_keys.iter().enumerate() {
300 if x_key == y_key {
301 match_pairs.push((Some(xi), Some(yi)));
302 y_matched[yi] = true;
303 found = true;
304 }
305 }
306 if !found && all_x {
307 match_pairs.push((Some(xi), None));
308 }
309 }
310 if all_y {
311 for (yi, matched) in y_matched.iter().enumerate() {
312 if !matched {
313 match_pairs.push((None, Some(yi)));
314 }
315 }
316 }
317
318 let result_nrow = match_pairs.len();
319
320 let y_join_set: std::collections::HashSet<usize> = y_join_indices.iter().copied().collect();
322
323 let mut output_columns: Vec<(Option<String>, RValue)> = Vec::new();
325
326 for (ki, &x_col_idx) in x_join_indices.iter().enumerate() {
328 let y_col_idx = y_join_indices[ki];
329 let name = x_names[x_col_idx].clone();
330 let x_col = &x_list.values[x_col_idx].1;
331 let y_col = &y_list.values[y_col_idx].1;
332 let values: Vec<Option<String>> = match_pairs
333 .iter()
334 .map(|(mx, my)| {
335 if let Some(xi) = mx {
336 match x_col {
337 RValue::Vector(rv) => rv.inner.to_characters().get(*xi).cloned().flatten(),
338 _ => None,
339 }
340 } else if let Some(yi) = my {
341 match y_col {
342 RValue::Vector(rv) => rv.inner.to_characters().get(*yi).cloned().flatten(),
343 _ => None,
344 }
345 } else {
346 None
347 }
348 })
349 .collect();
350 output_columns.push((name, RValue::vec(Vector::Character(values.into()))));
351 }
352
353 let x_join_set: std::collections::HashSet<usize> = x_join_indices.iter().copied().collect();
355 for (col_idx, (name, val)) in x_list.values.iter().enumerate() {
356 if x_join_set.contains(&col_idx) {
357 continue;
358 }
359 let col_name = name.clone().or_else(|| Some(format!("x.{}", col_idx + 1)));
360 let new_val = select_rows_from_column(val, &match_pairs, true);
361 output_columns.push((col_name, new_val));
362 }
363
364 for (col_idx, (name, val)) in y_list.values.iter().enumerate() {
366 if y_join_set.contains(&col_idx) {
367 continue;
368 }
369 let base_name = name.clone().unwrap_or_else(|| format!("y.{}", col_idx + 1));
371 let col_name = if x_names.iter().any(|n| n.as_deref() == Some(&base_name))
372 && !x_join_set
373 .iter()
374 .any(|&i| x_names[i].as_deref() == Some(&base_name))
375 {
376 Some(format!("{}.y", base_name))
377 } else {
378 Some(base_name)
379 };
380 let new_val = select_rows_from_column(val, &match_pairs, false);
381 output_columns.push((col_name, new_val));
382 }
383
384 build_data_frame(output_columns, result_nrow)
385}
386
387fn select_rows_from_column(
389 val: &RValue,
390 pairs: &[(Option<usize>, Option<usize>)],
391 use_first: bool,
392) -> RValue {
393 match val {
394 RValue::Vector(rv) => {
395 let len = rv.inner.len();
396 let indices: Vec<Option<usize>> = pairs
397 .iter()
398 .map(|(mx, my)| if use_first { *mx } else { *my })
399 .collect();
400 match &rv.inner {
401 Vector::Character(vals) => {
402 let result: Vec<Option<String>> = indices
403 .iter()
404 .map(|idx| idx.and_then(|i| if i < len { vals[i].clone() } else { None }))
405 .collect();
406 RValue::vec(Vector::Character(result.into()))
407 }
408 Vector::Double(vals) => {
409 let result: Vec<Option<f64>> = indices
410 .iter()
411 .map(|idx| idx.and_then(|i| if i < len { vals.get_opt(i) } else { None }))
412 .collect();
413 RValue::vec(Vector::Double(result.into()))
414 }
415 Vector::Integer(vals) => {
416 let result: Vec<Option<i64>> = indices
417 .iter()
418 .map(|idx| idx.and_then(|i| if i < len { vals.get_opt(i) } else { None }))
419 .collect();
420 RValue::vec(Vector::Integer(result.into()))
421 }
422 Vector::Logical(vals) => {
423 let result: Vec<Option<bool>> = indices
424 .iter()
425 .map(|idx| idx.and_then(|i| if i < len { vals[i] } else { None }))
426 .collect();
427 RValue::vec(Vector::Logical(result.into()))
428 }
429 Vector::Complex(vals) => {
430 let result: Vec<Option<num_complex::Complex64>> = indices
431 .iter()
432 .map(|idx| idx.and_then(|i| if i < len { vals[i] } else { None }))
433 .collect();
434 RValue::vec(Vector::Complex(result.into()))
435 }
436 Vector::Raw(vals) => {
437 let result: Vec<u8> = indices
438 .iter()
439 .map(|idx| idx.map_or(0, |i| if i < len { vals[i] } else { 0 }))
440 .collect();
441 RValue::vec(Vector::Raw(result))
442 }
443 }
444 }
445 _ => RValue::Null,
446 }
447}
448
449#[pre_eval_builtin(min_args = 1)]
464fn pre_eval_subset(
465 args: &[Arg],
466 env: &Environment,
467 context: &BuiltinContext,
468) -> Result<RValue, RError> {
469 let x_arg = args.first().and_then(|a| a.value.as_ref()).ok_or_else(|| {
471 RError::new(
472 RErrorKind::Argument,
473 "subset() requires a data frame argument".to_string(),
474 )
475 })?;
476
477 let x_val = context
478 .with_interpreter(|interp| interp.eval_in(x_arg, env))
479 .map_err(RError::from)?;
480
481 if let RValue::Vector(ref rv) = x_val {
483 if !has_class(&x_val, "data.frame") {
484 return subset_vector_impl(rv, args, env, context);
485 }
486 }
487
488 let RValue::List(ref x_list) = x_val else {
489 return Err(RError::new(
490 RErrorKind::Type,
491 "subset() requires a data frame or vector".to_string(),
492 ));
493 };
494
495 if !has_class(&x_val, "data.frame") {
496 return Err(RError::new(
497 RErrorKind::Type,
498 "subset() requires a data frame".to_string(),
499 ));
500 }
501
502 let nrow = df_nrow(x_list);
503 let col_names = df_col_names(x_list);
504
505 let subset_expr = args
507 .iter()
508 .find(|a| a.name.as_deref() == Some("subset"))
509 .or_else(|| args.get(1).filter(|a| a.name.is_none()))
510 .and_then(|a| a.value.as_ref());
511
512 let select_expr = args
513 .iter()
514 .find(|a| a.name.as_deref() == Some("select"))
515 .or_else(|| args.get(2).filter(|a| a.name.is_none()))
516 .and_then(|a| a.value.as_ref());
517
518 let row_mask = if let Some(expr) = subset_expr {
520 let df_env = df_to_env(x_list, env);
521 let mask_val = context
522 .with_interpreter(|interp| interp.eval_in(expr, &df_env))
523 .map_err(RError::from)?;
524 match mask_val {
525 RValue::Vector(rv) => {
526 let logicals = rv.inner.to_logicals();
527 if logicals.len() != nrow {
528 return Err(RError::new(
529 RErrorKind::Argument,
530 format!(
531 "subset condition has length {} but data frame has {} rows",
532 logicals.len(),
533 nrow
534 ),
535 ));
536 }
537 logicals
538 }
539 _ => {
540 return Err(RError::new(
541 RErrorKind::Type,
542 "subset condition must evaluate to a logical vector".to_string(),
543 ))
544 }
545 }
546 } else {
547 vec![Some(true); nrow]
548 };
549
550 let selected_cols = if let Some(expr) = select_expr {
552 resolve_column_selection(expr, &col_names)?
553 } else {
554 (0..col_names.len()).collect()
555 };
556
557 let result_nrow = row_mask.iter().filter(|m| **m == Some(true)).count();
559 let mut output_columns = Vec::new();
560 for &col_idx in &selected_cols {
561 let (name, val) = &x_list.values[col_idx];
562 let new_val = subset_rvalue_by_mask(val, &row_mask);
563 output_columns.push((name.clone(), new_val));
564 }
565
566 build_data_frame(output_columns, result_nrow)
567}
568
569fn subset_vector_impl(
571 rv: &RVector,
572 args: &[Arg],
573 env: &Environment,
574 context: &BuiltinContext,
575) -> Result<RValue, RError> {
576 let subset_expr = args
577 .iter()
578 .find(|a| a.name.as_deref() == Some("subset"))
579 .or_else(|| args.get(1).filter(|a| a.name.is_none()))
580 .and_then(|a| a.value.as_ref());
581
582 if let Some(expr) = subset_expr {
583 let mask_val = context
584 .with_interpreter(|interp| interp.eval_in(expr, env))
585 .map_err(RError::from)?;
586 match mask_val {
587 RValue::Vector(mask_rv) => {
588 let logicals = mask_rv.inner.to_logicals();
589 let result = subset_vector_by_mask(&rv.inner, &logicals);
590 Ok(RValue::vec(result))
591 }
592 _ => Err(RError::new(
593 RErrorKind::Type,
594 "subset condition must evaluate to a logical vector".to_string(),
595 )),
596 }
597 } else {
598 Ok(RValue::Vector(rv.clone()))
599 }
600}
601
602fn resolve_column_selection(
606 expr: &Expr,
607 col_names: &[Option<String>],
608) -> Result<Vec<usize>, RError> {
609 match expr {
610 Expr::UnaryOp {
612 op: crate::parser::ast::UnaryOp::Neg,
613 operand,
614 } => {
615 let exclude = resolve_column_selection(operand, col_names)?;
616 let exclude_set: std::collections::HashSet<usize> = exclude.into_iter().collect();
617 Ok((0..col_names.len())
618 .filter(|i| !exclude_set.contains(i))
619 .collect())
620 }
621 Expr::Call { func, args, .. } => {
623 if let Expr::Symbol(name) = func.as_ref() {
624 if name == "c" {
625 let mut indices = Vec::new();
626 for arg in args {
627 if let Some(Expr::Symbol(col_name)) = arg.value.as_ref() {
628 if let Some(idx) = col_names
629 .iter()
630 .position(|n| n.as_deref() == Some(col_name.as_str()))
631 {
632 indices.push(idx);
633 } else {
634 return Err(RError::new(
635 RErrorKind::Argument,
636 format!("column '{}' not found in data frame", col_name),
637 ));
638 }
639 }
640 }
641 return Ok(indices);
642 }
643 }
644 Err(RError::new(
645 RErrorKind::Argument,
646 "select argument must be a column name or c(...) of column names".to_string(),
647 ))
648 }
649 Expr::Symbol(name) => {
651 if let Some(idx) = col_names
652 .iter()
653 .position(|n| n.as_deref() == Some(name.as_str()))
654 {
655 Ok(vec![idx])
656 } else {
657 Err(RError::new(
658 RErrorKind::Argument,
659 format!("column '{}' not found in data frame", name),
660 ))
661 }
662 }
663 _ => Err(RError::new(
664 RErrorKind::Argument,
665 "select argument must be a column name, c(...), or negation of these".to_string(),
666 )),
667 }
668}
669
670#[pre_eval_builtin(name = "transform", min_args = 1)]
683fn pre_eval_transform(
684 args: &[Arg],
685 env: &Environment,
686 context: &BuiltinContext,
687) -> Result<RValue, RError> {
688 let data_arg = args.first().and_then(|a| a.value.as_ref()).ok_or_else(|| {
689 RError::new(
690 RErrorKind::Argument,
691 "transform() requires a data frame argument".to_string(),
692 )
693 })?;
694
695 let data_val = context
696 .with_interpreter(|interp| interp.eval_in(data_arg, env))
697 .map_err(RError::from)?;
698
699 let RValue::List(mut data_list) = data_val.clone() else {
700 return Err(RError::new(
701 RErrorKind::Type,
702 "transform() requires a data frame".to_string(),
703 ));
704 };
705
706 if !has_class(&data_val, "data.frame") {
707 return Err(RError::new(
708 RErrorKind::Type,
709 "transform() requires a data frame".to_string(),
710 ));
711 }
712
713 let nrow = df_nrow(&data_list);
714 let col_names = df_col_names(&data_list);
715
716 for arg in args.iter().skip(1) {
718 let Some(ref col_name) = arg.name else {
719 continue; };
721 let Some(ref expr) = arg.value else {
722 continue;
723 };
724
725 let df_env = df_to_env(&data_list, env);
727 let new_val = context
728 .with_interpreter(|interp| interp.eval_in(expr, &df_env))
729 .map_err(RError::from)?;
730
731 let val_len = new_val.length();
733 if val_len != nrow && val_len != 1 && nrow > 0 {
734 if !nrow.is_multiple_of(val_len) {
735 return Err(RError::new(
736 RErrorKind::Argument,
737 format!(
738 "replacement has {} rows, data has {} — must be a divisor",
739 val_len, nrow
740 ),
741 ));
742 }
743 let recycled = recycle_value(&new_val, nrow)?;
745 if let Some(idx) = col_names
746 .iter()
747 .position(|n| n.as_deref() == Some(col_name.as_str()))
748 {
749 data_list.values[idx] = (Some(col_name.clone()), recycled);
750 } else {
751 data_list.values.push((Some(col_name.clone()), recycled));
752 }
753 continue;
754 }
755
756 if let Some(idx) = col_names
758 .iter()
759 .position(|n| n.as_deref() == Some(col_name.as_str()))
760 {
761 data_list.values[idx] = (Some(col_name.clone()), new_val);
762 } else {
763 data_list.values.push((Some(col_name.clone()), new_val));
764 }
765 }
766
767 let names: Vec<Option<String>> = data_list
769 .values
770 .iter()
771 .map(|(name, _)| name.clone())
772 .collect();
773 data_list.set_attr(
774 "names".to_string(),
775 RValue::vec(Vector::Character(names.into())),
776 );
777
778 Ok(RValue::List(data_list))
779}
780
781#[pre_eval_builtin(name = "with", min_args = 2)]
794fn pre_eval_with(
795 args: &[Arg],
796 env: &Environment,
797 context: &BuiltinContext,
798) -> Result<RValue, RError> {
799 let data_arg = args.first().and_then(|a| a.value.as_ref()).ok_or_else(|| {
800 RError::new(
801 RErrorKind::Argument,
802 "with() requires a data argument".to_string(),
803 )
804 })?;
805
806 let data_val = context
807 .with_interpreter(|interp| interp.eval_in(data_arg, env))
808 .map_err(RError::from)?;
809
810 let data_list = match &data_val {
811 RValue::List(l) => l,
812 _ => {
813 return Err(RError::new(
814 RErrorKind::Type,
815 "with() requires a list or data frame as first argument".to_string(),
816 ))
817 }
818 };
819
820 let expr = args.get(1).and_then(|a| a.value.as_ref()).ok_or_else(|| {
821 RError::new(
822 RErrorKind::Argument,
823 "with() requires an expression argument".to_string(),
824 )
825 })?;
826
827 let df_env = df_to_env(data_list, env);
828 context
829 .with_interpreter(|interp| interp.eval_in(expr, &df_env))
830 .map_err(RError::from)
831}
832
833