Skip to main content

miniextendr_api/
r_memory.rs

1//! Utilities for recovering R SEXPs from raw data pointers.
2//!
3//! R stores vector data at a fixed offset after the SEXPREC header. Given a
4//! pointer into that data region, we can subtract the header size to recover
//! the SEXP — then verify it with a raw read of the sxpinfo type tag, followed
//! by R's ALTREP and XLENGTH accessors (plain field reads for non-ALTREP vectors).
7//!
8//! This is used by:
9//! - Arrow integration: zero-copy IntoR when the buffer is R-backed
10//! - `Cow<[T]>` IntoR round-trip
11//!
12//! # Initialization
13//!
14//! [`init_sexprec_data_offset`] must be called during package init (before any
15//! recovery attempts). It measures the offset on a real R vector, so it works
16//! across R versions and platforms.
17//!
18//! # R's VECTOR_SEXPREC layout
19//!
20//! ```text
21//! // From R's Defn.h:
22//! typedef struct VECTOR_SEXPREC {
23//!     SEXPREC_HEADER;           // sxpinfo(8) + attrib(8) + gengc_next(8) + gengc_prev(8)
24//!     struct vecsxp_struct {    // length(8) + truelength(8)
25//!         R_xlen_t length;
26//!         R_xlen_t truelength;
27//!     } vecsxp;
28//! } VECTOR_SEXPREC;
29//!
30//! typedef union { VECTOR_SEXPREC s; double align; } SEXPREC_ALIGN;
31//! #define STDVEC_DATAPTR(x) ((void *)(((SEXPREC_ALIGN *)(x)) + 1))
32//! ```
33//!
34//! On 64-bit: `sizeof(VECTOR_SEXPREC)` = 48 bytes, `sizeof(SEXPREC_ALIGN)` = 48.
35//! Data starts at `sexp + 48`. All vector types (REALSXP, INTSXP, RAWSXP,
36//! STRSXP, VECSXP) use the same `VECTOR_SEXPREC` header.
37//!
38//! # Why not `#[repr(C)]` mirror struct?
39//!
40//! A Rust `#[repr(C)]` struct mirroring `VECTOR_SEXPREC` would give a
41//! compile-time `size_of` instead of runtime measurement. However:
42//! - R's layout can vary by version and compile options (32-bit, padding)
43//! - The runtime measurement is one allocation at init — negligible
44//! - A `repr(C)` mirror struct doesn't help with the real safety issue:
45//!   reading from a speculative pointer. `addr_of!` computes field addresses
46//!   without dereferencing, but we still need to `read()` the type tag — and
47//!   that read is from potentially invalid memory for non-R pointers.
48//!
49//! The verification (type tag + ALTREP check + XLENGTH) prevents false
50//! positives. Only the type tag requires a raw sxpinfo read; ALTREP and
51//! XLENGTH use R's public C API.
52//!
53//! # Safety of speculative reads
54//!
55//! The candidate pointer is computed from pointer arithmetic on the input
56//! data_ptr. For Rust-owned buffers (not R-backed), this points into
57//! arbitrary heap memory. We must be careful about which R functions we
58//! call on it:
59//!
60//! - **`ALTREP(x)`** — safe: just reads `x->sxpinfo.alt` (a single bit).
61//! - **`XLENGTH(x)`** on non-ALTREP — safe: reads `STDVEC_LENGTH` (struct
62//!   field, no dispatch, no error).
63//! - **`LENGTH(x)`** — UNSAFE: wraps XLENGTH with `> INT_MAX` check that
64//!   calls `R_BadLongVector()` (throws R error on garbage with large length).
65//! - **`DATAPTR_RO(x)`** — UNSAFE on ALTREP: dispatches through class vtable
66//!   (bogus function pointers on garbage). On non-ALTREP: `STDVEC_DATAPTR`
67//!   which also checks for long vectors.
68//!
69//! The verification sequence is:
70//! 1. Raw sxpinfo type tag (bits 0-4) — no public TYPEOF that's safe on garbage
71//! 2. `ALTREP(candidate)` — gates step 3 (rejects ALTREP before XLENGTH dispatch)
72//! 3. `XLENGTH(candidate)` — safe for non-ALTREP (STDVEC_LENGTH, no errors)
73
74use std::sync::atomic::{AtomicUsize, Ordering};
75
76use crate::ffi::{self, SEXP, SEXPTYPE, SexpExt};
77
/// Byte distance from a standard (non-ALTREP) vector's SEXP address to the
/// first byte of its data.
///
/// Once measured: `DATAPTR_RO(sexp) == (sexp as *const u8).add(SEXPREC_DATA_OFFSET)`
///
/// A stored value of zero is the "not yet initialized" marker.
static SEXPREC_DATA_OFFSET: AtomicUsize = AtomicUsize::new(0);

/// Read the cached SEXPREC data offset.
///
/// Yields 0 when the offset has not been measured yet.
#[inline]
pub fn sexprec_data_offset() -> usize {
    let cell: &AtomicUsize = &SEXPREC_DATA_OFFSET;
    cell.load(Ordering::Relaxed)
}
92
93/// Compute and store the SEXPREC data offset by measuring a real R vector.
94///
95/// Must be called from R's main thread during package init.
96///
97/// # Safety
98///
99/// Must be called on R's main thread with R initialized.
100pub unsafe fn init_sexprec_data_offset() {
101    unsafe {
102        let test = ffi::Rf_protect(ffi::Rf_allocVector(SEXPTYPE::REALSXP, 1));
103        let sexp_addr = test.0 as usize;
104        let data_addr = ffi::DATAPTR_RO(test) as usize;
105        SEXPREC_DATA_OFFSET.store(data_addr - sexp_addr, Ordering::Relaxed);
106        ffi::Rf_unprotect(1);
107    }
108}
109
110/// Try to recover the source R SEXP from a data pointer.
111///
112/// Given a pointer that may point into an R vector's data area, this
113/// subtracts the known SEXPREC header size to get a candidate SEXP, then
114/// verifies it:
115/// 1. The SEXP type tag (bits 0-4 of sxpinfo) matches `expected_type`
116/// 2. `ALTREP(candidate)` is false (only non-ALTREP vectors have fixed-offset data)
117/// 3. `XLENGTH(candidate)` matches `expected_len` (safe for non-ALTREP)
118///
119/// Returns `None` if:
120/// - The offset hasn't been initialized yet
121/// - The pointer doesn't come from an R vector
122/// - The candidate SEXP has the wrong type or length
123/// - The candidate is an ALTREP vector (data not at fixed offset from SEXP)
124///
125/// # Safety
126///
127/// Must be called on R's main thread. The data pointer must be valid
128/// (i.e., it must point to readable memory for at least `expected_len`
129/// elements, which is guaranteed if it came from an Arrow buffer).
130pub unsafe fn try_recover_r_sexp(
131    data_ptr: *const u8,
132    expected_type: SEXPTYPE,
133    expected_len: usize,
134) -> Option<SEXP> {
135    let offset = SEXPREC_DATA_OFFSET.load(Ordering::Relaxed);
136    if offset == 0 {
137        return None;
138    }
139
140    // Zero-length vectors can't be recovered (R uses sentinel pointer 0x1,
141    // and empty Arrow buffers use dangling pointers).
142    if expected_len == 0 {
143        return None;
144    }
145
146    let data_addr = data_ptr as usize;
147
148    // Reject pointers that would wraparound or are in invalid ranges.
149    // R's sentinel for empty vectors is 0x1; wrapping_byte_sub on small
150    // addresses produces huge values (top of address space) → segfault.
151    // The 4096 threshold also guards against speculative reads near page
152    // boundaries — for non-R pointers (e.g. Rust-allocated Arrow buffers),
153    // subtracting the offset could land before the start of mapped memory.
154    if data_addr < offset.saturating_add(4096) {
155        return None;
156    }
157
158    // Compute candidate SEXP by subtracting header size.
159    // wrapping_byte_sub is defined behavior for all pointer arithmetic.
160    let candidate_ptr = (data_ptr as *mut ffi::SEXPREC).wrapping_byte_sub(offset);
161
162    let candidate = SEXP(candidate_ptr);
163
164    // Quick check: type tag (bits 0-4 of sxpinfo, which is the first field).
165    // For Rust-allocated buffers this reads arbitrary heap memory, but
166    // wrapping_sub ensures the pointer arithmetic itself is defined.
167    // The read is a plain u32 load from mapped heap.
168    // No public R API reads TYPEOF without side effects on invalid pointers,
169    // so a raw sxpinfo read is unavoidable here.
170    let sxpinfo_bits = unsafe { *(candidate.0 as *const u32) };
171    let type_bits = sxpinfo_bits & 0x1f;
172    if type_bits != expected_type as u32 {
173        return None;
174    }
175
176    // Reject ALTREP via R's public API (SexpExt::is_altrep → ALTREP(x)).
177    // ALTREP vectors store data via indirection — can't recover them.
178    // This also gates the xlength() call below: for non-ALTREP, XLENGTH is
179    // just STDVEC_LENGTH (a struct field read, no dispatch, no error).
180    // For ALTREP, XLENGTH dispatches through the class vtable which would
181    // crash on a garbage pointer.
182    if candidate.is_altrep() {
183        return None;
184    }
185
186    // For non-ALTREP, xlength() → Rf_xlength → STDVEC_LENGTH — a direct
187    // struct field read with no dispatch and no "long vectors not supported"
188    // error. (LENGTH() wraps XLENGTH with an INT_MAX check that can
189    // R_BadLongVector; XLENGTH itself never errors for non-ALTREP vectors.)
190    if candidate.len() != expected_len {
191        return None;
192    }
193
194    // No DATAPTR_RO round-trip check needed: for non-ALTREP vectors,
195    // STDVEC_DATAPTR(x) == (char*)x + SEXPREC_DATA_OFFSET, and we
196    // constructed candidate = data_ptr - offset, so the round-trip
197    // is tautologically true. The type + ALTREP + length checks above
198    // are the actual discriminators.
199
200    Some(candidate)
201}