Skip to main content

bge_m3_embedding_server/
sysinfo.rs

1// Copyright (c) 2026 J. Patrick Fulton
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Memory detection for auto-budget computation.
16//!
17//! Production target is Linux (Fargate/ECS). On Linux we walk the cgroup
18//! hierarchy to find the container memory limit, then fall back to host RAM
19//! reported by `/proc/meminfo`. On macOS we read host RAM via `sysctl`;
20//! cgroup support requires unsafe FFI so it is deferred.
21//!
22//! ## cgroup-v2 detection on ECS Managed Instances (Bottlerocket)
23//!
24//! ECS Managed Instances launch containers **without** `--cgroupns=private`,
25//! so `/sys/fs/cgroup/memory.max` resolves to the unified-hierarchy root,
26//! which reads `"max"` (no limit). The actual container memory limit is
27//! set at a deeper path whose last component is recorded in
28//! `/proc/self/cgroup` (unified-hierarchy format: a single line
29//! `0::<path>`, e.g. `0::/ecs.slice/ecs-…-task.scope/<id>`).
30//!
//! `cgroup_memory()` reads `/proc/self/cgroup`, extracts that path, then
//! reads `memory.max` at the cgroup itself and at each ancestor (deepest
//! first) looking for a numeric limit. When the walk finds none (every level
//! reads `"max"` or is unreadable, as on a truly unconstrained host) it falls
//! through to the cgroup v1 check and then to `host_ram`.
35//!
36//! RSS tracking (`read_process_rss_bytes`) is Linux-only (parses
37//! `/proc/self/statm`). On macOS it returns `None`; the auto-budget logic
38//! treats `None` as "cannot measure model footprint" and uses conservative
39//! defaults.
40use tracing::warn;
41
42// ---------------------------------------------------------------------------
43// Public types
44// ---------------------------------------------------------------------------
45
/// Identifies which detection path produced a memory reading.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
#[allow(dead_code)] // CgroupV2/CgroupV1 are constructed only on Linux; macOS sees them as unused.
pub enum MemorySource {
    /// Value supplied through the `BGE_M3_AVAILABLE_MEMORY_BYTES` environment variable.
    Override,
    /// Limit read from a Linux cgroup v2 `memory.max` file.
    CgroupV2,
    /// Limit read from the Linux cgroup v1 `memory.limit_in_bytes` file.
    CgroupV1,
    /// Host memory: `MemAvailable` from `/proc/meminfo` on Linux, or
    /// `sysctl hw.memsize` on macOS. Also the source attached to the 4 GiB
    /// fallback reading when every detection path fails.
    HostRam,
}
59
60impl std::fmt::Display for MemorySource {
61    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62        match self {
63            Self::Override => f.write_str("override"),
64            Self::CgroupV2 => f.write_str("cgroup_v2"),
65            Self::CgroupV1 => f.write_str("cgroup_v1"),
66            Self::HostRam => f.write_str("host_ram"),
67        }
68    }
69}
70
71/// A memory reading with its provenance.
72#[derive(Debug, Clone, Copy)]
73pub struct MemoryReading {
74    /// Total available memory bytes detected from the source.
75    pub available_bytes: usize,
76    /// Detection method that produced this reading.
77    pub source: MemorySource,
78}
79
80// ---------------------------------------------------------------------------
81// Public API
82// ---------------------------------------------------------------------------
83
84/// Detects available memory for the process.
85///
86/// Detection chain (first success wins):
87/// 1. `BGE_M3_AVAILABLE_MEMORY_BYTES` env override.
88/// 2. Linux cgroup v2: `/sys/fs/cgroup/memory.max`.
89/// 3. Linux cgroup v1: `/sys/fs/cgroup/memory/memory.limit_in_bytes`.
90/// 4. Linux: `/proc/meminfo` `MemAvailable`.
91/// 5. macOS: `sysctl hw.memsize` (total host RAM; no cgroup support).
92/// 6. Fallback: 4 GiB constant with a warning log.
93pub(crate) fn detect_available_memory() -> MemoryReading {
94    // --- step 1: explicit override ---
95    if let Some(bytes) = env_override() {
96        return MemoryReading {
97            available_bytes: bytes,
98            source: MemorySource::Override,
99        };
100    }
101
102    // --- step 2 / 3: Linux cgroup ---
103    #[cfg(target_os = "linux")]
104    if let Some(r) = cgroup_memory() {
105        return r;
106    }
107
108    // --- step 4 / 5: OS-level RAM ---
109    if let Some(bytes) = host_ram() {
110        return MemoryReading {
111            available_bytes: bytes,
112            source: MemorySource::HostRam,
113        };
114    }
115
116    // --- fallback ---
117    let fallback: usize = 4 * 1024 * 1024 * 1024; // 4 GiB
118    warn!(
119        available_bytes = fallback,
120        "Memory detection failed on all paths; using 4 GiB fallback. \
121         Set BGE_M3_AVAILABLE_MEMORY_BYTES to override."
122    );
123    MemoryReading {
124        available_bytes: fallback,
125        source: MemorySource::HostRam,
126    }
127}
128
129/// Returns the current process's RSS (Resident Set Size) in bytes, or `None`
130/// if measurement is not supported on this platform.
131///
132/// Linux: parses `/proc/self/statm`. Field 1 (index 1) is RSS in pages;
133/// multiplied by the system page size (typically 4096).
134///
135/// macOS: returns `None` — requires `task_info` FFI which conflicts with
136/// `unsafe_code = "forbid"`. A future release can add it via the `mach2`
137/// crate.
138pub(crate) fn read_process_rss_bytes() -> Option<usize> {
139    #[cfg(target_os = "linux")]
140    return linux_rss();
141
142    #[cfg(not(target_os = "linux"))]
143    None
144}
145
146// ---------------------------------------------------------------------------
147// Private helpers
148// ---------------------------------------------------------------------------
149
150fn env_override() -> Option<usize> {
151    std::env::var("BGE_M3_AVAILABLE_MEMORY_BYTES")
152        .ok()
153        .and_then(|v| {
154            v.parse::<usize>().ok().or_else(|| {
155                warn!(
156                    value = %v,
157                    "BGE_M3_AVAILABLE_MEMORY_BYTES is not a valid usize; ignoring"
158                );
159                None
160            })
161        })
162}
163
164#[cfg(target_os = "linux")]
165fn cgroup_memory() -> Option<MemoryReading> {
166    // Sentinel threshold: the cgroup v1 kernel uses a near-i64::MAX value when
167    // no limit is configured. Treat any value ≥ 1 TiB as "unlimited".
168    const ONE_TIB: usize = 1024 * 1024 * 1024 * 1024;
169
170    // --- cgroup v2: path-walk from /proc/self/cgroup ---
171    //
172    // ECS Managed Instances (Bottlerocket) do NOT set --cgroupns=private, so
173    // /sys/fs/cgroup/memory.max resolves to the host root where value is "max".
174    // The container's actual limit lives at a deeper path recorded in
175    // /proc/self/cgroup (unified v2 format: `0::<path>`).
176    //
177    // Walk ancestors deepest-first until a numeric limit < 1 TiB is found.
178    // If the entire walk yields "max", fall through to cgroup v1 then host_ram.
179    if let Some(reading) = cgroup_v2_walk("/sys/fs/cgroup", "/proc/self/cgroup") {
180        return Some(reading);
181    }
182
183    // --- cgroup v1: /sys/fs/cgroup/memory/memory.limit_in_bytes ---
184    if let Ok(raw) = std::fs::read_to_string("/sys/fs/cgroup/memory/memory.limit_in_bytes") {
185        let trimmed = raw.trim();
186        if let Ok(bytes) = trimmed.parse::<usize>() {
187            if bytes < ONE_TIB {
188                tracing::debug!(bytes, source = "cgroup_v1", "Detected memory limit");
189                return Some(MemoryReading {
190                    available_bytes: bytes,
191                    source: MemorySource::CgroupV1,
192                });
193            }
194        }
195    }
196
197    None
198}
199
200/// Reads the cgroup v2 memory limit by walking ancestors of the container's
201/// cgroup path.
202///
203/// # Arguments
204///
205/// - `cgroup_fs_root`: the mountpoint of the cgroup v2 filesystem (normally
206///   `/sys/fs/cgroup`; injectable for unit tests).
207/// - `proc_self_cgroup`: path to the per-process cgroup file (normally
208///   `/proc/self/cgroup`; injectable for unit tests).
209///
210/// Parses the unified-hierarchy line (`0::<path>`), then iterates from the
211/// deepest ancestor up to the root, reading `memory.max` at each level.
212/// Returns the first numeric limit found that is below 1 TiB, or `None`
213/// when the entire walk yields `"max"` or the file is unreadable.
214#[cfg(target_os = "linux")]
215pub(crate) fn cgroup_v2_walk(
216    cgroup_fs_root: &str,
217    proc_self_cgroup: &str,
218) -> Option<MemoryReading> {
219    const ONE_TIB: usize = 1024 * 1024 * 1024 * 1024;
220
221    let cgroup_content = std::fs::read_to_string(proc_self_cgroup).ok()?;
222
223    // Unified hierarchy: exactly one line, format `0::<path>` (e.g. `0::/ecs.slice/…`)
224    // Legacy v1 has multiple lines, each with `<hierarchy_id>:<controllers>:<path>`.
225    // We only attempt v2 if we find the unified `0::` prefix.
226    let cgroup_rel_path = cgroup_content
227        .lines()
228        .find_map(|line| line.strip_prefix("0::"))?;
229
230    // Build the absolute cgroup directory path.
231    let cgroup_dir = std::path::PathBuf::from(cgroup_fs_root).join(
232        // Strip the leading '/' so PathBuf::join doesn't replace the root.
233        cgroup_rel_path.trim_start_matches('/'),
234    );
235
236    // Walk ancestors from deepest to shallowest (inclusive of the container
237    // cgroup itself, exclusive of the root mountpoint).
238    let mut current = cgroup_dir.as_path();
239    let fs_root = std::path::Path::new(cgroup_fs_root);
240
241    loop {
242        let memory_max = current.join("memory.max");
243        if let Ok(raw) = std::fs::read_to_string(&memory_max) {
244            let trimmed = raw.trim();
245            if trimmed != "max" {
246                if let Ok(bytes) = trimmed.parse::<usize>() {
247                    if bytes < ONE_TIB {
248                        tracing::debug!(
249                            bytes,
250                            source = "cgroup_v2",
251                            path = %memory_max.display(),
252                            "Detected memory limit"
253                        );
254                        return Some(MemoryReading {
255                            available_bytes: bytes,
256                            source: MemorySource::CgroupV2,
257                        });
258                    }
259                }
260            }
261        }
262
263        // Stop at the cgroup filesystem root — don't walk above it.
264        if current == fs_root {
265            break;
266        }
267
268        match current.parent() {
269            Some(parent) => current = parent,
270            None => break,
271        }
272    }
273
274    None
275}
276
277/// Linux: parse `MemAvailable` from `/proc/meminfo` (kB → bytes).
278/// macOS: read total host RAM via `sysctl hw.memsize`.
279fn host_ram() -> Option<usize> {
280    #[cfg(target_os = "linux")]
281    return linux_meminfo_available();
282
283    #[cfg(target_os = "macos")]
284    return macos_host_ram();
285
286    #[cfg(not(any(target_os = "linux", target_os = "macos")))]
287    None
288}
289
/// Extracts `MemAvailable` from `/proc/meminfo` and converts it to bytes.
///
/// The first line beginning with `MemAvailable:` is used; its second
/// whitespace-separated token is parsed as a kB count and multiplied by 1024.
/// Returns `None` if the file cannot be read, the line is absent, or the
/// value does not parse.
#[cfg(target_os = "linux")]
fn linux_meminfo_available() -> Option<usize> {
    let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
    let entry = meminfo
        .lines()
        .find(|candidate| candidate.starts_with("MemAvailable:"))?;
    // Token 0 is the `MemAvailable:` label, token 1 the numeric value in kB.
    let kib = entry.split_whitespace().nth(1)?.parse::<usize>().ok()?;
    Some(kib * 1024)
}
301
/// Queries total host RAM on macOS by running `sysctl -n hw.memsize`.
///
/// The command prints a byte count on stdout. Returns `None` if the command
/// cannot be spawned, its stdout is not valid UTF-8, or the trimmed output
/// does not parse as a `usize`.
#[cfg(target_os = "macos")]
fn macos_host_ram() -> Option<usize> {
    let sysctl = std::process::Command::new("sysctl")
        .arg("-n")
        .arg("hw.memsize")
        .output()
        .ok()?;
    let text = std::str::from_utf8(&sysctl.stdout).ok()?;
    text.trim().parse::<usize>().ok()
}
312
/// Returns this process's resident set size in bytes, or `None` when procfs
/// cannot be read or parsed.
///
/// The `VmRSS:` line of `/proc/self/status` is preferred because the kernel
/// reports it in kB, which makes the result independent of the page size.
/// `/proc/self/statm` reports pages instead, and converting pages to bytes
/// with a fixed 4096 under-reports on kernels built with larger pages
/// (16 KiB / 64 KiB pages exist on aarch64). The statm path is therefore only
/// used as a fallback when `VmRSS:` is unavailable.
#[cfg(target_os = "linux")]
fn linux_rss() -> Option<usize> {
    linux_rss_from_status().or_else(linux_rss_from_statm)
}

/// Parses `VmRSS:` (a kB count) from `/proc/self/status` into bytes.
#[cfg(target_os = "linux")]
fn linux_rss_from_status() -> Option<usize> {
    let status = std::fs::read_to_string("/proc/self/status").ok()?;
    // Line shape: `VmRSS:     12345 kB`; after the label the first token is
    // the number.
    let value = status
        .lines()
        .find_map(|line| line.strip_prefix("VmRSS:"))?;
    let kib: usize = value.split_whitespace().next()?.parse().ok()?;
    kib.checked_mul(1024)
}

/// Fallback: resident pages from `/proc/self/statm` times the assumed page size.
#[cfg(target_os = "linux")]
fn linux_rss_from_statm() -> Option<usize> {
    // /proc/self/statm: all values in pages.
    // Fields: size, rss, shared, text, lib, data, dt
    let raw = std::fs::read_to_string("/proc/self/statm").ok()?;
    let rss_pages: usize = raw.split_whitespace().nth(1)?.parse().ok()?;
    rss_pages.checked_mul(page_size_bytes())
}

/// Page size assumed when converting `/proc/self/statm` page counts to bytes.
///
/// This is an assumption, not a measurement: `sysconf(_SC_PAGESIZE)` needs
/// libc and parsing the ELF aux vector is out of scope here, so the common
/// Linux value of 4096 is hard-coded to stay free of `unsafe`. Because the
/// assumption does not hold on every kernel, `linux_rss` only relies on it
/// when the page-size-independent `VmRSS:` reading is unavailable.
#[cfg(target_os = "linux")]
fn page_size_bytes() -> usize {
    4096
}
333
334#[cfg(test)]
335mod tests;