bge_m3_embedding_server/sysinfo.rs
1// Copyright (c) 2026 J. Patrick Fulton
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Memory detection for auto-budget computation.
16//!
17//! Production target is Linux (Fargate/ECS). On Linux we walk the cgroup
18//! hierarchy to find the container memory limit, then fall back to host RAM
19//! reported by `/proc/meminfo`. On macOS we read host RAM via `sysctl`;
20//! cgroup support requires unsafe FFI so it is deferred.
21//!
22//! ## cgroup-v2 detection on ECS Managed Instances (Bottlerocket)
23//!
24//! ECS Managed Instances launch containers **without** `--cgroupns=private`,
25//! so `/sys/fs/cgroup/memory.max` resolves to the unified-hierarchy root,
//! which reads `"max"` (no limit). The actual container memory limit is
//! set at a deeper cgroup whose path, relative to the cgroup mount, is
//! recorded in `/proc/self/cgroup` (unified-hierarchy format: a single
//! line `0::<path>`, e.g. `0::/ecs.slice/ecs-…-task.scope/<id>`).
30//!
//! `cgroup_memory()` (via `cgroup_v2_walk()`) reads `/proc/self/cgroup`,
//! extracts that path, then inspects `memory.max` at the container's cgroup
//! and at each of its ancestors, deepest first, looking for a numeric limit.
//! When no level reports a usable numeric limit it tries the cgroup v1
//! `memory.limit_in_bytes` file, and only then falls through to `host_ram`
//! (truly unconstrained host).
35//!
36//! RSS tracking (`read_process_rss_bytes`) is Linux-only (parses
37//! `/proc/self/statm`). On macOS it returns `None`; the auto-budget logic
38//! treats `None` as "cannot measure model footprint" and uses conservative
39//! defaults.
40use tracing::warn;
41
42// ---------------------------------------------------------------------------
43// Public types
44// ---------------------------------------------------------------------------
45
/// Where the available-memory reading came from.
///
/// Carried alongside the byte count in `MemoryReading` so callers can tell
/// which detection step produced the value. The `Display` impl renders each
/// variant as a lower-case snake_case label.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(dead_code)] // CgroupV2/CgroupV1 are constructed only on Linux; macOS sees them as unused.
pub enum MemorySource {
    /// `BGE_M3_AVAILABLE_MEMORY_BYTES` env override.
    Override,
    /// Linux cgroup v2 `memory.max`.
    CgroupV2,
    /// Linux cgroup v1 `memory.limit_in_bytes`.
    CgroupV1,
    /// `/proc/meminfo` `MemAvailable` (Linux) or `sysctl hw.memsize` (macOS).
    ///
    /// `detect_available_memory` also reports this variant when every
    /// detection path fails and it falls back to the 4 GiB constant.
    HostRam,
}
59
60impl std::fmt::Display for MemorySource {
61 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
62 match self {
63 Self::Override => f.write_str("override"),
64 Self::CgroupV2 => f.write_str("cgroup_v2"),
65 Self::CgroupV1 => f.write_str("cgroup_v1"),
66 Self::HostRam => f.write_str("host_ram"),
67 }
68 }
69}
70
/// A memory reading with its provenance.
///
/// Produced by `detect_available_memory`; pairs the detected byte count with
/// the `MemorySource` that supplied it.
#[derive(Debug, Clone, Copy)]
pub struct MemoryReading {
    /// Memory available to the process, in bytes, as reported by `source`
    /// (an env override, a cgroup limit, a host RAM figure, or the 4 GiB
    /// fallback constant).
    pub available_bytes: usize,
    /// Detection method that produced this reading.
    pub source: MemorySource,
}
79
80// ---------------------------------------------------------------------------
81// Public API
82// ---------------------------------------------------------------------------
83
84/// Detects available memory for the process.
85///
86/// Detection chain (first success wins):
87/// 1. `BGE_M3_AVAILABLE_MEMORY_BYTES` env override.
88/// 2. Linux cgroup v2: `/sys/fs/cgroup/memory.max`.
89/// 3. Linux cgroup v1: `/sys/fs/cgroup/memory/memory.limit_in_bytes`.
90/// 4. Linux: `/proc/meminfo` `MemAvailable`.
91/// 5. macOS: `sysctl hw.memsize` (total host RAM; no cgroup support).
92/// 6. Fallback: 4 GiB constant with a warning log.
93pub(crate) fn detect_available_memory() -> MemoryReading {
94 // --- step 1: explicit override ---
95 if let Some(bytes) = env_override() {
96 return MemoryReading {
97 available_bytes: bytes,
98 source: MemorySource::Override,
99 };
100 }
101
102 // --- step 2 / 3: Linux cgroup ---
103 #[cfg(target_os = "linux")]
104 if let Some(r) = cgroup_memory() {
105 return r;
106 }
107
108 // --- step 4 / 5: OS-level RAM ---
109 if let Some(bytes) = host_ram() {
110 return MemoryReading {
111 available_bytes: bytes,
112 source: MemorySource::HostRam,
113 };
114 }
115
116 // --- fallback ---
117 let fallback: usize = 4 * 1024 * 1024 * 1024; // 4 GiB
118 warn!(
119 available_bytes = fallback,
120 "Memory detection failed on all paths; using 4 GiB fallback. \
121 Set BGE_M3_AVAILABLE_MEMORY_BYTES to override."
122 );
123 MemoryReading {
124 available_bytes: fallback,
125 source: MemorySource::HostRam,
126 }
127}
128
129/// Returns the current process's RSS (Resident Set Size) in bytes, or `None`
130/// if measurement is not supported on this platform.
131///
132/// Linux: parses `/proc/self/statm`. Field 1 (index 1) is RSS in pages;
133/// multiplied by the system page size (typically 4096).
134///
135/// macOS: returns `None` — requires `task_info` FFI which conflicts with
136/// `unsafe_code = "forbid"`. A future release can add it via the `mach2`
137/// crate.
138pub(crate) fn read_process_rss_bytes() -> Option<usize> {
139 #[cfg(target_os = "linux")]
140 return linux_rss();
141
142 #[cfg(not(target_os = "linux"))]
143 None
144}
145
146// ---------------------------------------------------------------------------
147// Private helpers
148// ---------------------------------------------------------------------------
149
150fn env_override() -> Option<usize> {
151 std::env::var("BGE_M3_AVAILABLE_MEMORY_BYTES")
152 .ok()
153 .and_then(|v| {
154 v.parse::<usize>().ok().or_else(|| {
155 warn!(
156 value = %v,
157 "BGE_M3_AVAILABLE_MEMORY_BYTES is not a valid usize; ignoring"
158 );
159 None
160 })
161 })
162}
163
164#[cfg(target_os = "linux")]
165fn cgroup_memory() -> Option<MemoryReading> {
166 // Sentinel threshold: the cgroup v1 kernel uses a near-i64::MAX value when
167 // no limit is configured. Treat any value ≥ 1 TiB as "unlimited".
168 const ONE_TIB: usize = 1024 * 1024 * 1024 * 1024;
169
170 // --- cgroup v2: path-walk from /proc/self/cgroup ---
171 //
172 // ECS Managed Instances (Bottlerocket) do NOT set --cgroupns=private, so
173 // /sys/fs/cgroup/memory.max resolves to the host root where value is "max".
174 // The container's actual limit lives at a deeper path recorded in
175 // /proc/self/cgroup (unified v2 format: `0::<path>`).
176 //
177 // Walk ancestors deepest-first until a numeric limit < 1 TiB is found.
178 // If the entire walk yields "max", fall through to cgroup v1 then host_ram.
179 if let Some(reading) = cgroup_v2_walk("/sys/fs/cgroup", "/proc/self/cgroup") {
180 return Some(reading);
181 }
182
183 // --- cgroup v1: /sys/fs/cgroup/memory/memory.limit_in_bytes ---
184 if let Ok(raw) = std::fs::read_to_string("/sys/fs/cgroup/memory/memory.limit_in_bytes") {
185 let trimmed = raw.trim();
186 if let Ok(bytes) = trimmed.parse::<usize>() {
187 if bytes < ONE_TIB {
188 tracing::debug!(bytes, source = "cgroup_v1", "Detected memory limit");
189 return Some(MemoryReading {
190 available_bytes: bytes,
191 source: MemorySource::CgroupV1,
192 });
193 }
194 }
195 }
196
197 None
198}
199
200/// Reads the cgroup v2 memory limit by walking ancestors of the container's
201/// cgroup path.
202///
203/// # Arguments
204///
205/// - `cgroup_fs_root`: the mountpoint of the cgroup v2 filesystem (normally
206/// `/sys/fs/cgroup`; injectable for unit tests).
207/// - `proc_self_cgroup`: path to the per-process cgroup file (normally
208/// `/proc/self/cgroup`; injectable for unit tests).
209///
210/// Parses the unified-hierarchy line (`0::<path>`), then iterates from the
211/// deepest ancestor up to the root, reading `memory.max` at each level.
212/// Returns the first numeric limit found that is below 1 TiB, or `None`
213/// when the entire walk yields `"max"` or the file is unreadable.
214#[cfg(target_os = "linux")]
215pub(crate) fn cgroup_v2_walk(
216 cgroup_fs_root: &str,
217 proc_self_cgroup: &str,
218) -> Option<MemoryReading> {
219 const ONE_TIB: usize = 1024 * 1024 * 1024 * 1024;
220
221 let cgroup_content = std::fs::read_to_string(proc_self_cgroup).ok()?;
222
223 // Unified hierarchy: exactly one line, format `0::<path>` (e.g. `0::/ecs.slice/…`)
224 // Legacy v1 has multiple lines, each with `<hierarchy_id>:<controllers>:<path>`.
225 // We only attempt v2 if we find the unified `0::` prefix.
226 let cgroup_rel_path = cgroup_content
227 .lines()
228 .find_map(|line| line.strip_prefix("0::"))?;
229
230 // Build the absolute cgroup directory path.
231 let cgroup_dir = std::path::PathBuf::from(cgroup_fs_root).join(
232 // Strip the leading '/' so PathBuf::join doesn't replace the root.
233 cgroup_rel_path.trim_start_matches('/'),
234 );
235
236 // Walk ancestors from deepest to shallowest (inclusive of the container
237 // cgroup itself, exclusive of the root mountpoint).
238 let mut current = cgroup_dir.as_path();
239 let fs_root = std::path::Path::new(cgroup_fs_root);
240
241 loop {
242 let memory_max = current.join("memory.max");
243 if let Ok(raw) = std::fs::read_to_string(&memory_max) {
244 let trimmed = raw.trim();
245 if trimmed != "max" {
246 if let Ok(bytes) = trimmed.parse::<usize>() {
247 if bytes < ONE_TIB {
248 tracing::debug!(
249 bytes,
250 source = "cgroup_v2",
251 path = %memory_max.display(),
252 "Detected memory limit"
253 );
254 return Some(MemoryReading {
255 available_bytes: bytes,
256 source: MemorySource::CgroupV2,
257 });
258 }
259 }
260 }
261 }
262
263 // Stop at the cgroup filesystem root — don't walk above it.
264 if current == fs_root {
265 break;
266 }
267
268 match current.parent() {
269 Some(parent) => current = parent,
270 None => break,
271 }
272 }
273
274 None
275}
276
277/// Linux: parse `MemAvailable` from `/proc/meminfo` (kB → bytes).
278/// macOS: read total host RAM via `sysctl hw.memsize`.
279fn host_ram() -> Option<usize> {
280 #[cfg(target_os = "linux")]
281 return linux_meminfo_available();
282
283 #[cfg(target_os = "macos")]
284 return macos_host_ram();
285
286 #[cfg(not(any(target_os = "linux", target_os = "macos")))]
287 None
288}
289
/// Parses `MemAvailable` out of `/proc/meminfo` and returns it in bytes.
///
/// The file reports the value in kB, so it is multiplied by 1024. Returns
/// `None` when the file cannot be read, has no `MemAvailable:` line, or that
/// line's value field is missing or not a valid `usize`.
#[cfg(target_os = "linux")]
fn linux_meminfo_available() -> Option<usize> {
    let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;

    // Only the first `MemAvailable:` line is considered.
    let entry = meminfo
        .lines()
        .find(|line| line.starts_with("MemAvailable:"))?;

    // Layout: `MemAvailable:   <value> kB`; the value is the second token.
    let kib = entry.split_whitespace().nth(1)?.parse::<usize>().ok()?;
    Some(kib * 1024)
}
301
/// Reads total host RAM in bytes on macOS by running `sysctl -n hw.memsize`.
///
/// Returns `None` when the command cannot be spawned, its stdout is not
/// valid UTF-8, or the trimmed output does not parse as a `usize`.
#[cfg(target_os = "macos")]
fn macos_host_ram() -> Option<usize> {
    // With `-n`, sysctl prints only the value: an integer byte count.
    let sysctl = std::process::Command::new("sysctl")
        .arg("-n")
        .arg("hw.memsize")
        .output()
        .ok()?;

    std::str::from_utf8(&sysctl.stdout)
        .ok()
        .and_then(|text| text.trim().parse::<usize>().ok())
}
312
313#[cfg(target_os = "linux")]
314fn linux_rss() -> Option<usize> {
315 // /proc/self/statm: all values in pages.
316 // Fields: size, rss, shared, text, lib, data, dt
317 let raw = std::fs::read_to_string("/proc/self/statm").ok()?;
318 let rss_pages: usize = raw.split_whitespace().nth(1)?.parse().ok()?;
319 // SAFETY: page_size is a compile-time constant on Linux (4096 on x86_64/arm64).
320 // We use sysconf(SC_PAGESIZE) via libc-free approach: fallback to 4096.
321 let page_size = page_size_bytes();
322 Some(rss_pages * page_size)
323}
324
/// Returns the system page size in bytes without `libc` or `unsafe`.
///
/// The kernel hands every process an ELF auxiliary vector, readable at
/// `/proc/self/auxv` as a flat array of native-endian `(type, value)` word
/// pairs. The `AT_PAGESZ` entry (type 6) carries the page size. Reading it
/// matters because 4096 is *not* universal: aarch64 kernels may be built with
/// 16 KiB or 64 KiB pages, and assuming 4 KiB there would under-report RSS.
///
/// Falls back to 4096 when the file cannot be read, has no `AT_PAGESZ`
/// entry, or reports a value that is not a power of two.
#[cfg(target_os = "linux")]
fn page_size_bytes() -> usize {
    /// Auxiliary-vector entry type that holds the system page size.
    const AT_PAGESZ: usize = 6;
    /// Page size assumed when the auxiliary vector is unavailable.
    const FALLBACK_PAGE_SIZE: usize = 4096;
    /// Width of one auxv field; matches the native word size of the process.
    const WORD: usize = std::mem::size_of::<usize>();

    /// Decodes one native-endian word; `bytes` must be exactly `WORD` long.
    fn read_word(bytes: &[u8]) -> usize {
        let mut word = [0u8; WORD];
        word.copy_from_slice(bytes);
        usize::from_ne_bytes(word)
    }

    std::fs::read("/proc/self/auxv")
        .ok()
        .and_then(|auxv| {
            // `chunks_exact` drops any trailing partial entry, so every chunk
            // splits cleanly into two full words.
            auxv.chunks_exact(2 * WORD)
                .map(|entry| {
                    let (kind, value) = entry.split_at(WORD);
                    (read_word(kind), read_word(value))
                })
                .find(|&(kind, _)| kind == AT_PAGESZ)
                .map(|(_, value)| value)
        })
        // Reject zero or otherwise nonsensical values.
        .filter(|size| size.is_power_of_two())
        .unwrap_or(FALLBACK_PAGE_SIZE)
}
333
334#[cfg(test)]
335mod tests;