Skip to main content

miniextendr_api/
encoding.rs

//! Encoding / locale probing utilities.
//!
//! This module exists mainly for debugging and experiments around R's string
//! encodings. R's runtime has both:
//! - per-CHARSXP encoding tags (UTF-8 / Latin-1 / bytes / native)
//! - global/locale-level settings (native encoding, UTF-8 locale flags)
//!
//! # Availability
//!
//! The global signals are **non-API** (from `Defn.h`) and require the `nonapi` feature.
//! Additionally, these symbols are **not exported from R's shared library**, so
//! `miniextendr_encoding_init()` only works when:
//! - Embedding R via `miniextendr-engine` (which links directly to R internals)
//! - Running on platforms where these symbols happen to be exported
//!
//! For R packages (loaded via `.Call`), these symbols are typically unavailable,
//! so `miniextendr_encoding_init()` is **disabled by default** in the entrypoint.
//! The module is still useful for standalone Rust applications embedding R.
19
20use std::sync::OnceLock;
21
/// Snapshot of R's encoding / locale state, taken when
/// `miniextendr_encoding_init()` runs and cached afterwards.
///
/// Every field is gated on the `nonapi` feature; when that feature is off the
/// struct has no fields and only records that initialization happened.
#[derive(Debug, Clone)]
pub struct REncodingInfo {
    /// Native encoding name reported by R (non-API).
    ///
    /// `None` when R returns a null pointer for the encoding name.
    #[cfg(feature = "nonapi")]
    pub native_encoding: Option<String>,
    /// R's flag for "the current locale is UTF-8" (non-API).
    #[cfg(feature = "nonapi")]
    pub utf8_locale: Option<bool>,
    /// R's flag for "the current locale is Latin-1" (non-API).
    #[cfg(feature = "nonapi")]
    pub latin1_locale: Option<bool>,
    /// R's flag for "strings are known to be UTF-8" (non-API).
    #[cfg(feature = "nonapi")]
    pub known_to_be_utf8: Option<bool>,
}
38
39static ENCODING_INFO: OnceLock<REncodingInfo> = OnceLock::new();
40
41/// Return the cached encoding info (if `miniextendr_encoding_init()` has run).
42#[inline]
43pub fn encoding_info() -> Option<&'static REncodingInfo> {
44    ENCODING_INFO.get()
45}
46
47/// Assert that R's locale is UTF-8.
48///
49/// Called once from `R_init_*` (package init). Errors if the R session
50/// does not use UTF-8, since `charsxp_to_str` assumes all CHARSXP bytes
51/// are valid UTF-8.
52///
53/// Uses `l10n_info()[["UTF-8"]]` which is public R API.
54#[unsafe(no_mangle)]
55pub extern "C" fn miniextendr_assert_utf8_locale() {
56    debug_assert!(
57        crate::worker::is_r_main_thread(),
58        "must be called from R main thread"
59    );
60    use crate::ffi::{
61        LOGICAL, R_BaseEnv, Rf_eval, Rf_install, Rf_protect, Rf_unprotect, Rf_xlength, SexpExt,
62    };
63
64    unsafe {
65        // Call l10n_info()
66        let call = crate::ffi::Rf_lang1(Rf_install(c"l10n_info".as_ptr()));
67        Rf_protect(call);
68        let info = Rf_eval(call, R_BaseEnv);
69        Rf_protect(info);
70
71        // Find the "UTF-8" element by name
72        let names = info.get_names();
73        let n = Rf_xlength(info);
74        let mut is_utf8 = false;
75        for i in 0..n {
76            let name_charsxp = names.string_elt(i);
77            let name_ptr = name_charsxp.r_char();
78            let name = std::ffi::CStr::from_ptr(name_ptr);
79            if name == c"UTF-8" {
80                let elt = info.vector_elt(i);
81                is_utf8 = LOGICAL(elt).read() != 0;
82                break;
83            }
84        }
85
86        Rf_unprotect(2);
87
88        if !is_utf8 {
89            crate::ffi::Rf_error_unchecked(
90                c"%s".as_ptr(),
91                c"miniextendr requires a UTF-8 locale (R >= 4.2.0 uses UTF-8 by default)".as_ptr(),
92            );
93        }
94    }
95}
96
97/// Initialize / snapshot R's encoding state.
98///
99/// Intended to be called once from `R_init_*` (package init).
100#[unsafe(no_mangle)]
101pub extern "C-unwind" fn miniextendr_encoding_init() {
102    let _ = ENCODING_INFO.get_or_init(|| {
103        #[cfg(feature = "nonapi")]
104        unsafe {
105            use crate::ffi::{Rboolean, nonapi_encoding};
106
107            let native_encoding = {
108                let ptr = nonapi_encoding::R_nativeEncoding();
109                if ptr.is_null() {
110                    None
111                } else {
112                    Some(std::ffi::CStr::from_ptr(ptr).to_string_lossy().into_owned())
113                }
114            };
115
116            let utf8_locale = Some(nonapi_encoding::utf8locale != Rboolean::FALSE);
117            let latin1_locale = Some(nonapi_encoding::latin1locale != Rboolean::FALSE);
118            let known_to_be_utf8 = Some(nonapi_encoding::known_to_be_utf8 != Rboolean::FALSE);
119
120            let info = REncodingInfo {
121                native_encoding,
122                utf8_locale,
123                latin1_locale,
124                known_to_be_utf8,
125            };
126
127            if std::env::var_os("MINIEXTENDR_ENCODING_DEBUG").is_some() {
128                let msg = format!("[miniextendr] encoding init: {info:?}\n");
129                if let Ok(c_msg) = std::ffi::CString::new(msg) {
130                    crate::ffi::REprintf_unchecked(c"%s".as_ptr(), c_msg.as_ptr());
131                }
132            }
133
134            info
135        }
136
137        #[cfg(not(feature = "nonapi"))]
138        {
139            REncodingInfo {}
140        }
141    });
142}