Skip to main content

miniextendr_api/
dataframe.rs

1//! Runtime wrapper for R `data.frame` objects.
2//!
3//! Provides [`DataFrameView`], a typed wrapper around an R `data.frame` SEXP backed
4//! by [`NamedList`] for O(1) column access by name. This complements the existing
5//! [`DataFrame<T>`](crate::convert::DataFrame) which handles row-to-column transposition
6//! for *creating* data frames; `DataFrameView` is for *receiving* and inspecting them.
7//!
8//! # Example
9//!
10//! ```ignore
11//! use miniextendr_api::dataframe::DataFrameView;
12//! use miniextendr_api::typed_list::{TypedListSpec, TypedEntry, TypeSpec};
13//!
14//! #[miniextendr]
15//! fn summarize(df: DataFrameView) -> f64 {
16//!     let x: Vec<f64> = df.column("x").unwrap();
17//!     x.iter().sum()
18//! }
19//! ```
20
21use crate::ffi::{self, SEXP, SEXPTYPE, SexpExt};
22use crate::from_r::{SexpError, TryFromSexp};
23use crate::into_r::IntoR;
24use crate::list::{List, NamedList};
25use crate::typed_list::{TypedList, TypedListError, TypedListSpec, validate_list};
26use std::ffi::CStr;
27
28// region: Error type
29
/// Error returned when constructing or validating a [`DataFrameView`].
///
/// The type is comparable (`PartialEq`/`Eq`) so callers and tests can match
/// on a whole error value instead of only on its rendered message.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DataFrameError {
    /// The SEXP is not a VECSXP. The payload describes the type that was found.
    NotList(String),
    /// The object does not inherit from `data.frame`.
    NotDataFrame,
    /// The list has no `names` attribute (columns must be named).
    NoNames,
    /// Could not extract `nrow` from `row.names` attribute.
    BadRowNames(String),
    /// Columns have unequal lengths (when promoting from NamedList).
    UnequalLengths {
        /// First column length encountered.
        expected: usize,
        /// The column name that differs.
        column: String,
        /// The actual length of that column.
        actual: usize,
    },
}

impl std::fmt::Display for DataFrameError {
    /// Render a short, human-readable description of the error.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::NotList(msg) => write!(f, "not a list: {}", msg),
            Self::NotDataFrame => f.write_str("object does not inherit from data.frame"),
            Self::NoNames => f.write_str("data.frame has no column names"),
            Self::BadRowNames(msg) => {
                write!(f, "could not extract nrow from row.names: {}", msg)
            }
            Self::UnequalLengths {
                expected,
                column,
                actual,
            } => write!(
                f,
                "column {:?} has length {} (expected {})",
                column, actual, expected
            ),
        }
    }
}

impl std::error::Error for DataFrameError {}
75// endregion
76
77// region: DataFrameView
78
79/// A validated R `data.frame` backed by [`NamedList`] for O(1) column access.
80///
81/// This type wraps an existing R data.frame SEXP and provides:
82/// - O(1) column access by name via the [`NamedList`] index
83/// - O(1) column access by position
84/// - Schema validation via [`TypedListSpec`]
85/// - Conversion back to [`NamedList`] or raw SEXP
86///
87/// # Construction
88///
89/// Use [`DataFrameView::from_sexp`] to wrap an existing R data.frame, or
90/// [`NamedList::as_data_frame`] / [`List::as_data_frame`] to promote a list.
91///
92/// # Relationship to `DataFrame<T>`
93///
94/// [`DataFrame<T>`](crate::convert::DataFrame) is for *creating* data frames from
95/// row-oriented Rust data (implements `IntoDataFrame`). `DataFrameView` is for
96/// *receiving* data frames from R and inspecting their contents.
97pub struct DataFrameView {
98    inner: NamedList,
99    nrow: usize,
100}
101
102impl DataFrameView {
103    /// Wrap an existing R `data.frame` SEXP.
104    ///
105    /// Validates that the object:
106    /// 1. Is a VECSXP (list)
107    /// 2. Inherits from `"data.frame"`
108    /// 3. Has a `names` attribute
109    /// 4. Has extractable `row.names` for nrow
110    ///
111    /// # Errors
112    ///
113    /// Returns [`DataFrameError`] if validation fails.
114    pub fn from_sexp(sexp: SEXP) -> Result<Self, DataFrameError> {
115        // 1. Check it's a list (VECSXP)
116        let stype = sexp.type_of();
117        if stype != SEXPTYPE::VECSXP {
118            return Err(DataFrameError::NotList(format!(
119                "expected VECSXP, got {:?}",
120                stype
121            )));
122        }
123
124        // 2. Check it inherits from data.frame
125        let inherits = sexp.is_data_frame();
126        if !inherits {
127            return Err(DataFrameError::NotDataFrame);
128        }
129
130        // 3. Build NamedList (requires names attribute)
131        let list = unsafe { List::from_raw(sexp) };
132        let inner = NamedList::new(list).ok_or(DataFrameError::NoNames)?;
133
134        // 4. Extract nrow from row.names
135        let nrow = extract_nrow(sexp)?;
136
137        Ok(Self { inner, nrow })
138    }
139
140    /// Construct an `DataFrameView` from a [`NamedList`] and a pre-validated nrow.
141    ///
142    /// This is used internally by [`NamedList::as_data_frame`] after validation.
143    fn from_named_list(inner: NamedList, nrow: usize) -> Self {
144        Self { inner, nrow }
145    }
146
147    // region: Column access
148
149    /// Get a column by name, converting each element to type `T`.
150    ///
151    /// Uses O(1) name lookup via the [`NamedList`] index.
152    ///
153    /// Returns `None` if the column name is not found or conversion fails.
154    ///
155    /// # Example
156    ///
157    /// ```ignore
158    /// let x: Option<Vec<f64>> = df.column("x");
159    /// ```
160    #[inline]
161    pub fn column<T>(&self, name: &str) -> Option<T>
162    where
163        T: TryFromSexp<Error = SexpError>,
164    {
165        self.inner.get(name)
166    }
167
168    /// Get a column by 0-based index, converting to type `T`.
169    ///
170    /// Returns `None` if the index is out of bounds or conversion fails.
171    #[inline]
172    pub fn column_index<T>(&self, idx: usize) -> Option<T>
173    where
174        T: TryFromSexp<Error = SexpError>,
175    {
176        let idx_isize: isize = idx.try_into().ok()?;
177        self.inner.get_index(idx_isize)
178    }
179
180    /// Get the raw SEXP for a column by name.
181    ///
182    /// Returns `None` if the column name is not found.
183    #[inline]
184    pub fn column_raw(&self, name: &str) -> Option<SEXP> {
185        self.inner.get_raw(name)
186    }
187    // endregion
188
189    // region: Accessors
190
191    /// Number of rows.
192    #[inline]
193    pub fn nrow(&self) -> usize {
194        self.nrow
195    }
196
197    /// Number of columns (number of named elements in the list).
198    #[inline]
199    pub fn ncol(&self) -> usize {
200        self.inner.named_len()
201    }
202
203    /// Iterate over column names (unordered, from the HashMap index).
204    pub fn names(&self) -> impl Iterator<Item = &str> {
205        self.inner.names()
206    }
207
208    /// Check if a column name exists.
209    #[inline]
210    pub fn contains_column(&self, name: &str) -> bool {
211        self.inner.contains(name)
212    }
213    // endregion
214
215    // region: Validation
216
217    /// Validate the data frame's column types against a [`TypedListSpec`].
218    ///
219    /// This bridges the [`typed_list`](crate::typed_list!) validation infrastructure
220    /// to data frames, allowing schema checks like:
221    ///
222    /// ```ignore
223    /// let spec = TypedListSpec::new(vec![
224    ///     TypedEntry::required("x", TypeSpec::Numeric(None)),
225    ///     TypedEntry::required("y", TypeSpec::Integer(None)),
226    /// ]);
227    /// df.validate(&spec)?;
228    /// ```
229    pub fn validate(&self, spec: &TypedListSpec) -> Result<TypedList, TypedListError> {
230        validate_list(self.inner.as_list(), spec)
231    }
232    // endregion
233
234    // region: Conversions
235
236    /// Convert to the underlying [`NamedList`], consuming the data frame wrapper.
237    ///
238    /// The SEXP retains its `data.frame` class attribute.
239    #[inline]
240    pub fn into_named_list(self) -> NamedList {
241        self.inner
242    }
243
244    /// Get the underlying [`List`].
245    #[inline]
246    pub fn as_list(&self) -> List {
247        self.inner.as_list()
248    }
249
250    /// Get the underlying SEXP.
251    #[inline]
252    pub fn as_sexp(&self) -> SEXP {
253        self.inner.as_list().as_sexp()
254    }
255    // endregion
256}
257// endregion
258
259// region: TryFromSexp for DataFrameView
260
261impl TryFromSexp for DataFrameView {
262    type Error = SexpError;
263
264    fn try_from_sexp(sexp: SEXP) -> Result<Self, Self::Error> {
265        DataFrameView::from_sexp(sexp).map_err(|e| SexpError::InvalidValue(e.to_string()))
266    }
267}
268// endregion
269
270// region: IntoR for DataFrameView — returns the backing SEXP unchanged
271
272impl IntoR for DataFrameView {
273    type Error = std::convert::Infallible;
274    fn try_into_sexp(self) -> Result<SEXP, Self::Error> {
275        Ok(self.into_sexp())
276    }
277    unsafe fn try_into_sexp_unchecked(self) -> Result<SEXP, Self::Error> {
278        self.try_into_sexp()
279    }
280    #[inline]
281    fn into_sexp(self) -> SEXP {
282        self.inner.as_list().as_sexp()
283    }
284}
285// endregion
286
287// region: nrow extraction from row.names
288
289/// Extract `nrow` from R's `row.names` attribute.
290///
291/// R data frames store row.names in two forms:
292/// 1. **Compact integer form**: `c(NA_integer_, -n)` — meaning `1:n` row names
293/// 2. **Explicit form**: A character or integer vector of length n
294///
295/// If no row.names attribute exists, we fall back to the length of the first column.
296fn extract_nrow(sexp: SEXP) -> Result<usize, DataFrameError> {
297    let row_names = sexp.get_row_names();
298
299    if row_names.is_nil() {
300        // No row.names — fall back to first column length
301        return nrow_from_first_column(sexp);
302    }
303
304    let rn_type = row_names.type_of();
305    let rn_len = unsafe { ffi::Rf_xlength(row_names) };
306
307    // Compact integer form: c(NA_integer_, -n) where n is the row count
308    if rn_type == SEXPTYPE::INTSXP && rn_len == 2 {
309        let rn: &[i32] = unsafe { row_names.as_slice() };
310
311        // NA_INTEGER = i32::MIN, second is -nrow
312        if rn[0] == i32::MIN && rn[1] < 0 {
313            return Ok((-rn[1]) as usize);
314        }
315        // Not compact form, but an actual 2-element integer vector — nrow = 2
316        // (This is unusual but valid)
317    }
318
319    // Explicit form: nrow = length of row.names vector
320    if let Ok(n) = usize::try_from(rn_len) {
321        Ok(n)
322    } else {
323        Err(DataFrameError::BadRowNames(format!(
324            "row.names has negative length: {}",
325            rn_len
326        )))
327    }
328}
329
330/// Fall back: extract nrow from the length of the first column.
331fn nrow_from_first_column(sexp: SEXP) -> Result<usize, DataFrameError> {
332    let ncol = unsafe { ffi::Rf_xlength(sexp) };
333    if ncol == 0 {
334        // 0 columns → 0 rows
335        return Ok(0);
336    }
337    let first_col = sexp.vector_elt(0);
338    if first_col == SEXP::nil() {
339        return Ok(0);
340    }
341    let len = unsafe { ffi::Rf_xlength(first_col) };
342    if let Ok(n) = usize::try_from(len) {
343        Ok(n)
344    } else {
345        Err(DataFrameError::BadRowNames(
346            "first column has negative length".to_string(),
347        ))
348    }
349}
350// endregion
351
352// region: NamedList → DataFrameView promotion
353
354/// Validate that all columns in a NamedList have equal length, returning the common length.
355fn validate_equal_lengths(named: &NamedList) -> Result<usize, DataFrameError> {
356    let list = named.as_list();
357    let n = list.len();
358
359    if n == 0 {
360        return Ok(0);
361    }
362
363    // Get the length of the first column
364    let first_col = list.as_sexp().vector_elt(0);
365    let expected: usize = unsafe { ffi::Rf_xlength(first_col) }
366        .try_into()
367        .expect("column length must be non-negative");
368
369    // Check all columns match
370    let names_sexp = list.names();
371    for i in 1..n {
372        let col = list.as_sexp().vector_elt(i);
373        let col_len: usize = unsafe { ffi::Rf_xlength(col) }
374            .try_into()
375            .expect("column length must be non-negative");
376        if col_len != expected {
377            // Try to get the column name for the error message
378            let col_name = if let Some(names) = names_sexp {
379                let name_sexp = names.string_elt(i);
380                if name_sexp != SEXP::na_string() {
381                    let name_ptr = name_sexp.r_char();
382                    let name_cstr = unsafe { CStr::from_ptr(name_ptr) };
383                    name_cstr.to_str().unwrap_or("<invalid>").to_string()
384                } else {
385                    format!("column {}", i)
386                }
387            } else {
388                format!("column {}", i)
389            };
390
391            return Err(DataFrameError::UnequalLengths {
392                expected,
393                column: col_name,
394                actual: col_len,
395            });
396        }
397    }
398
399    Ok(expected)
400}
401
402impl NamedList {
403    /// Promote this named list to a [`DataFrameView`].
404    ///
405    /// Validates that all columns have equal length, then sets the `class`
406    /// attribute to `"data.frame"` and adds compact integer `row.names`.
407    ///
408    /// # Errors
409    ///
410    /// Returns [`DataFrameError::UnequalLengths`] if columns differ in length.
411    pub fn as_data_frame(&self) -> Result<DataFrameView, DataFrameError> {
412        let nrow = validate_equal_lengths(self)?;
413
414        // Set class attribute to "data.frame"
415        self.as_list().set_data_frame_class();
416
417        // Set compact row.names: c(NA_integer_, -nrow)
418        self.as_list().set_row_names_int(nrow);
419
420        // Clone the NamedList (List is Copy, we rebuild the HashMap index)
421        let inner = NamedList::new(self.as_list())
422            .expect("NamedList already has names; promotion should not lose them");
423
424        Ok(DataFrameView::from_named_list(inner, nrow))
425    }
426}
427
428impl List {
429    /// Promote this named list to a [`DataFrameView`].
430    ///
431    /// The list must have a `names` attribute and all columns must have equal length.
432    ///
433    /// # Errors
434    ///
435    /// Returns [`DataFrameError`] if the list has no names or columns differ in length.
436    pub fn as_data_frame(&self) -> Result<DataFrameView, DataFrameError> {
437        let named = NamedList::new(*self).ok_or(DataFrameError::NoNames)?;
438        named.as_data_frame()
439    }
440}
441// endregion
442
443// region: Debug impl
444
445impl std::fmt::Debug for DataFrameView {
446    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
447        f.debug_struct("DataFrameView")
448            .field("nrow", &self.nrow)
449            .field("ncol", &self.ncol())
450            .finish()
451    }
452}
453
#[cfg(test)]
mod tests {
    use super::*;

    /// Every `DataFrameError` variant renders its documented message.
    #[test]
    fn test_data_frame_error_display() {
        let cases = vec![
            (
                DataFrameError::NotList("expected VECSXP, got INTSXP".to_string()),
                "not a list: expected VECSXP, got INTSXP",
            ),
            (
                DataFrameError::NotDataFrame,
                "object does not inherit from data.frame",
            ),
            (DataFrameError::NoNames, "data.frame has no column names"),
            (
                DataFrameError::BadRowNames("row.names has negative length: -1".to_string()),
                "could not extract nrow from row.names: row.names has negative length: -1",
            ),
            (
                DataFrameError::UnequalLengths {
                    expected: 3,
                    column: "y".to_string(),
                    actual: 5,
                },
                "column \"y\" has length 5 (expected 3)",
            ),
        ];

        for (err, expected) in cases.iter() {
            assert_eq!(err.to_string(), *expected);
        }
    }
}
474// endregion