bge_m3_embedding_server/bootstrap/budget.rs
1// Copyright (c) 2026 J. Patrick Fulton
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Pure workspace-budget arithmetic shared between the readiness probe and
16//! the unit tests.
17
18use crate::embedder::OS_HEADROOM_BYTES;
19
20/// Computes per-worker workspace budget and derived stats from memory inputs.
21///
22/// # Returns
23///
24/// `(per_worker_workspace, worst_case_peak, utilization_pct)` where:
25/// - `per_worker_workspace`: bytes available to one worker for a single
26/// `session.run()` call (passed as `rss_ceiling` to the probe).
27/// - `worst_case_peak`: total bytes consumed when all workers run
28/// simultaneously at budget ceiling (used for the 90% OOM warning).
29/// - `utilization_pct`: `worst_case_peak / available_bytes × 100`.
30///
31/// Extracted as a pure function so the budget logic is unit-testable
32/// independently of the async readiness probe machinery.
33//
34// cast_precision_loss: available_bytes ≤ ~28 GB (Fargate limit), total_workspace
35// similarly bounded; f64 has 2^52 mantissa (~4.5 PB) — no precision loss.
36// cast_possible_truncation: per_worker_workspace is a byte budget; truncating
37// sub-byte fractions is intentional and harmless.
38// cast_sign_loss: total_workspace is derived from saturating_sub — always ≥ 0.
39#[allow(
40 clippy::cast_precision_loss,
41 clippy::cast_possible_truncation,
42 clippy::cast_sign_loss
43)]
44pub(super) fn compute_workspace_budget(
45 available_bytes: usize,
46 n_workers: usize,
47 model_rss_per_worker: usize,
48 safety_factor: f64,
49) -> (usize, usize, f64) {
50 let total_workspace = available_bytes
51 .saturating_sub(n_workers.saturating_mul(model_rss_per_worker))
52 .saturating_sub(OS_HEADROOM_BYTES);
53 let per_worker_workspace = (total_workspace as f64 * safety_factor / n_workers as f64) as usize;
54
55 let worst_case_peak = n_workers
56 .saturating_mul(per_worker_workspace)
57 .saturating_add(n_workers.saturating_mul(model_rss_per_worker))
58 .saturating_add(OS_HEADROOM_BYTES);
59
60 let utilization_pct = if available_bytes > 0 {
61 worst_case_peak as f64 / available_bytes as f64 * 100.0
62 } else {
63 0.0
64 };
65
66 (per_worker_workspace, worst_case_peak, utilization_pct)
67}