miniextendr_api/dataframe.rs
1//! Runtime wrapper for R `data.frame` objects.
2//!
3//! Provides [`DataFrameView`], a typed wrapper around an R `data.frame` SEXP backed
4//! by [`NamedList`] for O(1) column access by name. This complements the existing
5//! [`DataFrame<T>`](crate::convert::DataFrame) which handles row-to-column transposition
6//! for *creating* data frames; `DataFrameView` is for *receiving* and inspecting them.
7//!
8//! # Example
9//!
10//! ```ignore
11//! use miniextendr_api::dataframe::DataFrameView;
12//! use miniextendr_api::typed_list::{TypedListSpec, TypedEntry, TypeSpec};
13//!
14//! #[miniextendr]
15//! fn summarize(df: DataFrameView) -> f64 {
16//! let x: Vec<f64> = df.column("x").unwrap();
17//! x.iter().sum()
18//! }
19//! ```
20
21use crate::ffi::{self, SEXP, SEXPTYPE, SexpExt};
22use crate::from_r::{SexpError, TryFromSexp};
23use crate::into_r::IntoR;
24use crate::list::{List, NamedList};
25use crate::typed_list::{TypedList, TypedListError, TypedListSpec, validate_list};
26use std::ffi::CStr;
27
28// region: Error type
29
/// Error returned when constructing or validating a [`DataFrameView`].
///
/// The type is comparable (`PartialEq`/`Eq`) so callers and tests can match
/// on a whole error value instead of comparing rendered messages.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataFrameError {
    /// The SEXP is not a VECSXP. The payload describes the type that was found.
    NotList(String),
    /// The object does not inherit from `data.frame`.
    NotDataFrame,
    /// The list has no `names` attribute (columns must be named).
    NoNames,
    /// Could not extract `nrow` from the `row.names` attribute.
    /// The payload explains what was wrong.
    BadRowNames(String),
    /// Columns have unequal lengths (when promoting from NamedList).
    UnequalLengths {
        /// Length of the first column, which every other column must match.
        expected: usize,
        /// Name of the first column whose length differs.
        column: String,
        /// The actual length of that column.
        actual: usize,
    },
}

impl std::fmt::Display for DataFrameError {
    /// Render a human-readable, single-line description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotList(msg) => write!(f, "not a list: {}", msg),
            Self::NotDataFrame => f.write_str("object does not inherit from data.frame"),
            Self::NoNames => f.write_str("data.frame has no column names"),
            Self::BadRowNames(msg) => {
                write!(f, "could not extract nrow from row.names: {}", msg)
            }
            Self::UnequalLengths {
                expected,
                column,
                actual,
            } => write!(
                f,
                "column {:?} has length {} (expected {})",
                column, actual, expected
            ),
        }
    }
}

impl std::error::Error for DataFrameError {}
75// endregion
76
77// region: DataFrameView
78
79/// A validated R `data.frame` backed by [`NamedList`] for O(1) column access.
80///
81/// This type wraps an existing R data.frame SEXP and provides:
82/// - O(1) column access by name via the [`NamedList`] index
83/// - O(1) column access by position
84/// - Schema validation via [`TypedListSpec`]
85/// - Conversion back to [`NamedList`] or raw SEXP
86///
87/// # Construction
88///
89/// Use [`DataFrameView::from_sexp`] to wrap an existing R data.frame, or
90/// [`NamedList::as_data_frame`] / [`List::as_data_frame`] to promote a list.
91///
92/// # Relationship to `DataFrame<T>`
93///
94/// [`DataFrame<T>`](crate::convert::DataFrame) is for *creating* data frames from
95/// row-oriented Rust data (implements `IntoDataFrame`). `DataFrameView` is for
96/// *receiving* data frames from R and inspecting their contents.
97pub struct DataFrameView {
98 inner: NamedList,
99 nrow: usize,
100}
101
102impl DataFrameView {
103 /// Wrap an existing R `data.frame` SEXP.
104 ///
105 /// Validates that the object:
106 /// 1. Is a VECSXP (list)
107 /// 2. Inherits from `"data.frame"`
108 /// 3. Has a `names` attribute
109 /// 4. Has extractable `row.names` for nrow
110 ///
111 /// # Errors
112 ///
113 /// Returns [`DataFrameError`] if validation fails.
114 pub fn from_sexp(sexp: SEXP) -> Result<Self, DataFrameError> {
115 // 1. Check it's a list (VECSXP)
116 let stype = sexp.type_of();
117 if stype != SEXPTYPE::VECSXP {
118 return Err(DataFrameError::NotList(format!(
119 "expected VECSXP, got {:?}",
120 stype
121 )));
122 }
123
124 // 2. Check it inherits from data.frame
125 let inherits = sexp.is_data_frame();
126 if !inherits {
127 return Err(DataFrameError::NotDataFrame);
128 }
129
130 // 3. Build NamedList (requires names attribute)
131 let list = unsafe { List::from_raw(sexp) };
132 let inner = NamedList::new(list).ok_or(DataFrameError::NoNames)?;
133
134 // 4. Extract nrow from row.names
135 let nrow = extract_nrow(sexp)?;
136
137 Ok(Self { inner, nrow })
138 }
139
140 /// Construct an `DataFrameView` from a [`NamedList`] and a pre-validated nrow.
141 ///
142 /// This is used internally by [`NamedList::as_data_frame`] after validation.
143 fn from_named_list(inner: NamedList, nrow: usize) -> Self {
144 Self { inner, nrow }
145 }
146
147 // region: Column access
148
149 /// Get a column by name, converting each element to type `T`.
150 ///
151 /// Uses O(1) name lookup via the [`NamedList`] index.
152 ///
153 /// Returns `None` if the column name is not found or conversion fails.
154 ///
155 /// # Example
156 ///
157 /// ```ignore
158 /// let x: Option<Vec<f64>> = df.column("x");
159 /// ```
160 #[inline]
161 pub fn column<T>(&self, name: &str) -> Option<T>
162 where
163 T: TryFromSexp<Error = SexpError>,
164 {
165 self.inner.get(name)
166 }
167
168 /// Get a column by 0-based index, converting to type `T`.
169 ///
170 /// Returns `None` if the index is out of bounds or conversion fails.
171 #[inline]
172 pub fn column_index<T>(&self, idx: usize) -> Option<T>
173 where
174 T: TryFromSexp<Error = SexpError>,
175 {
176 let idx_isize: isize = idx.try_into().ok()?;
177 self.inner.get_index(idx_isize)
178 }
179
180 /// Get the raw SEXP for a column by name.
181 ///
182 /// Returns `None` if the column name is not found.
183 #[inline]
184 pub fn column_raw(&self, name: &str) -> Option<SEXP> {
185 self.inner.get_raw(name)
186 }
187 // endregion
188
189 // region: Accessors
190
191 /// Number of rows.
192 #[inline]
193 pub fn nrow(&self) -> usize {
194 self.nrow
195 }
196
197 /// Number of columns (number of named elements in the list).
198 #[inline]
199 pub fn ncol(&self) -> usize {
200 self.inner.named_len()
201 }
202
203 /// Iterate over column names (unordered, from the HashMap index).
204 pub fn names(&self) -> impl Iterator<Item = &str> {
205 self.inner.names()
206 }
207
208 /// Check if a column name exists.
209 #[inline]
210 pub fn contains_column(&self, name: &str) -> bool {
211 self.inner.contains(name)
212 }
213 // endregion
214
215 // region: Validation
216
217 /// Validate the data frame's column types against a [`TypedListSpec`].
218 ///
219 /// This bridges the [`typed_list`](crate::typed_list!) validation infrastructure
220 /// to data frames, allowing schema checks like:
221 ///
222 /// ```ignore
223 /// let spec = TypedListSpec::new(vec![
224 /// TypedEntry::required("x", TypeSpec::Numeric(None)),
225 /// TypedEntry::required("y", TypeSpec::Integer(None)),
226 /// ]);
227 /// df.validate(&spec)?;
228 /// ```
229 pub fn validate(&self, spec: &TypedListSpec) -> Result<TypedList, TypedListError> {
230 validate_list(self.inner.as_list(), spec)
231 }
232 // endregion
233
234 // region: Conversions
235
236 /// Convert to the underlying [`NamedList`], consuming the data frame wrapper.
237 ///
238 /// The SEXP retains its `data.frame` class attribute.
239 #[inline]
240 pub fn into_named_list(self) -> NamedList {
241 self.inner
242 }
243
244 /// Get the underlying [`List`].
245 #[inline]
246 pub fn as_list(&self) -> List {
247 self.inner.as_list()
248 }
249
250 /// Get the underlying SEXP.
251 #[inline]
252 pub fn as_sexp(&self) -> SEXP {
253 self.inner.as_list().as_sexp()
254 }
255 // endregion
256}
257// endregion
258
259// region: TryFromSexp for DataFrameView
260
261impl TryFromSexp for DataFrameView {
262 type Error = SexpError;
263
264 fn try_from_sexp(sexp: SEXP) -> Result<Self, Self::Error> {
265 DataFrameView::from_sexp(sexp).map_err(|e| SexpError::InvalidValue(e.to_string()))
266 }
267}
268// endregion
269
270// region: IntoR for DataFrameView — returns the backing SEXP unchanged
271
272impl IntoR for DataFrameView {
273 type Error = std::convert::Infallible;
274 fn try_into_sexp(self) -> Result<SEXP, Self::Error> {
275 Ok(self.into_sexp())
276 }
277 unsafe fn try_into_sexp_unchecked(self) -> Result<SEXP, Self::Error> {
278 self.try_into_sexp()
279 }
280 #[inline]
281 fn into_sexp(self) -> SEXP {
282 self.inner.as_list().as_sexp()
283 }
284}
285// endregion
286
287// region: nrow extraction from row.names
288
289/// Extract `nrow` from R's `row.names` attribute.
290///
291/// R data frames store row.names in two forms:
292/// 1. **Compact integer form**: `c(NA_integer_, -n)` — meaning `1:n` row names
293/// 2. **Explicit form**: A character or integer vector of length n
294///
295/// If no row.names attribute exists, we fall back to the length of the first column.
296fn extract_nrow(sexp: SEXP) -> Result<usize, DataFrameError> {
297 let row_names = sexp.get_row_names();
298
299 if row_names.is_nil() {
300 // No row.names — fall back to first column length
301 return nrow_from_first_column(sexp);
302 }
303
304 let rn_type = row_names.type_of();
305 let rn_len = unsafe { ffi::Rf_xlength(row_names) };
306
307 // Compact integer form: c(NA_integer_, -n) where n is the row count
308 if rn_type == SEXPTYPE::INTSXP && rn_len == 2 {
309 let rn: &[i32] = unsafe { row_names.as_slice() };
310
311 // NA_INTEGER = i32::MIN, second is -nrow
312 if rn[0] == i32::MIN && rn[1] < 0 {
313 return Ok((-rn[1]) as usize);
314 }
315 // Not compact form, but an actual 2-element integer vector — nrow = 2
316 // (This is unusual but valid)
317 }
318
319 // Explicit form: nrow = length of row.names vector
320 if let Ok(n) = usize::try_from(rn_len) {
321 Ok(n)
322 } else {
323 Err(DataFrameError::BadRowNames(format!(
324 "row.names has negative length: {}",
325 rn_len
326 )))
327 }
328}
329
330/// Fall back: extract nrow from the length of the first column.
331fn nrow_from_first_column(sexp: SEXP) -> Result<usize, DataFrameError> {
332 let ncol = unsafe { ffi::Rf_xlength(sexp) };
333 if ncol == 0 {
334 // 0 columns → 0 rows
335 return Ok(0);
336 }
337 let first_col = sexp.vector_elt(0);
338 if first_col == SEXP::nil() {
339 return Ok(0);
340 }
341 let len = unsafe { ffi::Rf_xlength(first_col) };
342 if let Ok(n) = usize::try_from(len) {
343 Ok(n)
344 } else {
345 Err(DataFrameError::BadRowNames(
346 "first column has negative length".to_string(),
347 ))
348 }
349}
350// endregion
351
352// region: NamedList → DataFrameView promotion
353
354/// Validate that all columns in a NamedList have equal length, returning the common length.
355fn validate_equal_lengths(named: &NamedList) -> Result<usize, DataFrameError> {
356 let list = named.as_list();
357 let n = list.len();
358
359 if n == 0 {
360 return Ok(0);
361 }
362
363 // Get the length of the first column
364 let first_col = list.as_sexp().vector_elt(0);
365 let expected: usize = unsafe { ffi::Rf_xlength(first_col) }
366 .try_into()
367 .expect("column length must be non-negative");
368
369 // Check all columns match
370 let names_sexp = list.names();
371 for i in 1..n {
372 let col = list.as_sexp().vector_elt(i);
373 let col_len: usize = unsafe { ffi::Rf_xlength(col) }
374 .try_into()
375 .expect("column length must be non-negative");
376 if col_len != expected {
377 // Try to get the column name for the error message
378 let col_name = if let Some(names) = names_sexp {
379 let name_sexp = names.string_elt(i);
380 if name_sexp != SEXP::na_string() {
381 let name_ptr = name_sexp.r_char();
382 let name_cstr = unsafe { CStr::from_ptr(name_ptr) };
383 name_cstr.to_str().unwrap_or("<invalid>").to_string()
384 } else {
385 format!("column {}", i)
386 }
387 } else {
388 format!("column {}", i)
389 };
390
391 return Err(DataFrameError::UnequalLengths {
392 expected,
393 column: col_name,
394 actual: col_len,
395 });
396 }
397 }
398
399 Ok(expected)
400}
401
402impl NamedList {
403 /// Promote this named list to a [`DataFrameView`].
404 ///
405 /// Validates that all columns have equal length, then sets the `class`
406 /// attribute to `"data.frame"` and adds compact integer `row.names`.
407 ///
408 /// # Errors
409 ///
410 /// Returns [`DataFrameError::UnequalLengths`] if columns differ in length.
411 pub fn as_data_frame(&self) -> Result<DataFrameView, DataFrameError> {
412 let nrow = validate_equal_lengths(self)?;
413
414 // Set class attribute to "data.frame"
415 self.as_list().set_data_frame_class();
416
417 // Set compact row.names: c(NA_integer_, -nrow)
418 self.as_list().set_row_names_int(nrow);
419
420 // Clone the NamedList (List is Copy, we rebuild the HashMap index)
421 let inner = NamedList::new(self.as_list())
422 .expect("NamedList already has names; promotion should not lose them");
423
424 Ok(DataFrameView::from_named_list(inner, nrow))
425 }
426}
427
428impl List {
429 /// Promote this named list to a [`DataFrameView`].
430 ///
431 /// The list must have a `names` attribute and all columns must have equal length.
432 ///
433 /// # Errors
434 ///
435 /// Returns [`DataFrameError`] if the list has no names or columns differ in length.
436 pub fn as_data_frame(&self) -> Result<DataFrameView, DataFrameError> {
437 let named = NamedList::new(*self).ok_or(DataFrameError::NoNames)?;
438 named.as_data_frame()
439 }
440}
441// endregion
442
443// region: Debug impl
444
445impl std::fmt::Debug for DataFrameView {
446 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
447 f.debug_struct("DataFrameView")
448 .field("nrow", &self.nrow)
449 .field("ncol", &self.ncol())
450 .finish()
451 }
452}
453
#[cfg(test)]
mod tests {
    use super::*;

    /// Pins the user-facing message of every `DataFrameError` variant.
    #[test]
    fn test_data_frame_error_display() {
        let err = DataFrameError::NotList("expected VECSXP, got INTSXP".to_string());
        assert_eq!(err.to_string(), "not a list: expected VECSXP, got INTSXP");

        let err = DataFrameError::NotDataFrame;
        assert_eq!(err.to_string(), "object does not inherit from data.frame");

        let err = DataFrameError::NoNames;
        assert_eq!(err.to_string(), "data.frame has no column names");

        let err = DataFrameError::BadRowNames("row.names has negative length: -1".to_string());
        assert_eq!(
            err.to_string(),
            "could not extract nrow from row.names: row.names has negative length: -1"
        );

        let err = DataFrameError::UnequalLengths {
            expected: 3,
            column: "y".to_string(),
            actual: 5,
        };
        assert_eq!(err.to_string(), "column \"y\" has length 5 (expected 3)");
    }
}
474// endregion