Skip to main content

bge_m3_embedding_server/bootstrap/
budget.rs

1// Copyright (c) 2026 J. Patrick Fulton
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Pure workspace-budget arithmetic shared between the readiness probe and
16//! the unit tests.
17
18use crate::embedder::OS_HEADROOM_BYTES;
19
20/// Computes per-worker workspace budget and derived stats from memory inputs.
21///
22/// # Returns
23///
24/// `(per_worker_workspace, worst_case_peak, utilization_pct)` where:
25/// - `per_worker_workspace`: bytes available to one worker for a single
26///   `session.run()` call (passed as `rss_ceiling` to the probe).
27/// - `worst_case_peak`: total bytes consumed when all workers run
28///   simultaneously at budget ceiling (used for the 90% OOM warning).
29/// - `utilization_pct`: `worst_case_peak / available_bytes × 100`.
30///
31/// Extracted as a pure function so the budget logic is unit-testable
32/// independently of the async readiness probe machinery.
33//
34// cast_precision_loss: available_bytes ≤ ~28 GB (Fargate limit), total_workspace
35//   similarly bounded; f64 has 2^52 mantissa (~4.5 PB) — no precision loss.
36// cast_possible_truncation: per_worker_workspace is a byte budget; truncating
37//   sub-byte fractions is intentional and harmless.
38// cast_sign_loss: total_workspace is derived from saturating_sub — always ≥ 0.
39#[allow(
40    clippy::cast_precision_loss,
41    clippy::cast_possible_truncation,
42    clippy::cast_sign_loss
43)]
44pub(super) fn compute_workspace_budget(
45    available_bytes: usize,
46    n_workers: usize,
47    model_rss_per_worker: usize,
48    safety_factor: f64,
49) -> (usize, usize, f64) {
50    let total_workspace = available_bytes
51        .saturating_sub(n_workers.saturating_mul(model_rss_per_worker))
52        .saturating_sub(OS_HEADROOM_BYTES);
53    let per_worker_workspace = (total_workspace as f64 * safety_factor / n_workers as f64) as usize;
54
55    let worst_case_peak = n_workers
56        .saturating_mul(per_worker_workspace)
57        .saturating_add(n_workers.saturating_mul(model_rss_per_worker))
58        .saturating_add(OS_HEADROOM_BYTES);
59
60    let utilization_pct = if available_bytes > 0 {
61        worst_case_peak as f64 / available_bytes as f64 * 100.0
62    } else {
63        0.0
64    };
65
66    (per_worker_workspace, worst_case_peak, utilization_pct)
67}