// bge_m3_embedding_server/config.rs
1// Copyright (c) 2026 J. Patrick Fulton
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Server configuration loaded from environment variables at startup.
16//!
17//! All settings are read once via [`Config::from_env`] and then immutable
18//! for the server's lifetime. See each field's doc comment for the
19//! corresponding environment variable name and default value.
20
21use crate::binpack::CostModel;
22use std::env;
23use std::time::Duration;
24use tracing::warn;
25
/// ONNX model variant to load.
///
/// Controlled by `BGE_M3_MODEL`. Defaults to [`ModelVariant::Fp16`], which is
/// also this type's [`Default`] impl so the canonical fleet default is
/// discoverable from the type rather than buried in env-var parsing.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum ModelVariant {
    /// BAAI/bge-m3 FP32 model (~2.16 GB per session).
    ///
    /// Set `BGE_M3_MODEL=fp32` to enable. Recommended for Apple Silicon `CoreML`
    /// deployments where latency is the primary constraint: the FP32 ONNX graph
    /// contains no Cast nodes, so ORT can dispatch the entire multi-head
    /// attention + FFN block as one contiguous `CoreML` subgraph to the GPU —
    /// delivering 20–61% lower latency than the MLAS CPU baseline.
    ///
    /// **Not the default.** Linux/Intel (MLAS-only) deployments should prefer
    /// [`ModelVariant::Fp16`] for lower RAM and fleet-wide embedding consistency.
    Fp32,
    /// Xenova/bge-m3 FP16 model (~1.08 GB per session). **Default.**
    /// Halves per-session memory vs FP32 (~50% reduction; ~1.08 GB vs ~2.16 GB).
    ///
    /// This is the fleet default: all Apple Silicon `LaunchAgent` deployments set
    /// `BGE_M3_MODEL=fp16` explicitly, and the server default matches so that
    /// Linux/Docker deployments produce consistent embeddings without any
    /// additional configuration.
    ///
    /// **Latency caveat (`CoreML` only).** The Xenova FP16 ONNX model contains
    /// FP16↔FP32 Cast nodes at every transformer-layer boundary. ORT's `CoreML` EP
    /// cannot fuse these into the attention/FFN subgraphs; each Cast executes on
    /// CPU and the transformer block never forms a single contiguous GPU subgraph.
    /// Result: FP16 + `CoreML` EP runs 6–10× slower than FP32 + `CoreML`. On
    /// MLAS/CPU EP (Linux, Intel), this Cast overhead is similarly present but
    /// the MLAS FP16 penalty (~6–9×) is the accepted trade-off for lower RAM and
    /// fleet consistency. Use `BGE_M3_MODEL=fp32` on Apple Silicon to recover
    /// `CoreML` GPU acceleration.
    #[default]
    Fp16,
    /// Xenova/bge-m3 INT8 quantized model (~568 MB per session).
    /// Weights-only quantization; ORT dequantizes to f32 internally.
    /// Reduces peak memory by ~74% per worker vs FP32.
    ///
    /// Embedding quality validated: dense cosine similarity ≥ 0.963 vs FP32
    /// reference across a 184-text corpus — suitable for ANN search and semantic
    /// ranking. Avoid for applications requiring ranking precision within very
    /// small similarity margins (< 0.05 apart).
    ///
    /// **Use with MLAS (CPU EP) only.** `DequantizeLinear` nodes fragment the
    /// `CoreML` execution plan identically to FP16 Cast nodes; INT8 + `CoreML` EP
    /// runs 42–79% slower than INT8 + MLAS with no GPU benefit.
    Int8,
}

/// Renders the variant as its lowercase `BGE_M3_MODEL` token
/// (`"fp32"`, `"fp16"`, `"int8"`), matching the values accepted by
/// [`Config::from_lookup`].
impl std::fmt::Display for ModelVariant {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Fp32 => f.write_str("fp32"),
            Self::Fp16 => f.write_str("fp16"),
            Self::Int8 => f.write_str("int8"),
        }
    }
}
84
/// Maximum sequence length supported by the model architecture.
/// BGE-M3's positional embedding table extends to 8192; this is the hard upper
/// bound used to validate `BGE_M3_MAX_SEQ_LENGTH`.
///
/// Values outside `[1, MODEL_MAX_SEQ]` are clamped to this constant by
/// [`Config::from_lookup`], which logs a warning when it does so.
pub const MODEL_MAX_SEQ: usize = 8192;
89
/// Runtime configuration loaded from environment variables.
///
/// All fields are read once at startup via [`Config::from_env`]. Changes to
/// environment variables after startup have no effect.
///
/// Numeric variables that are set but fail to parse fall back silently to
/// their documented defaults.
pub struct Config {
    /// Path to the directory where ONNX model files are cached.
    ///
    /// Set with `BGE_M3_CACHE_DIR`. Defaults to `/cache`.
    pub cache_dir: String,
    /// TCP bind address for the HTTP server.
    ///
    /// Set with `BGE_M3_BIND`. Defaults to `0.0.0.0:8081`.
    /// The `0.0.0.0` default is intentional for Docker container deployments.
    pub bind_addr: String,
    /// Number of embedding worker threads to spawn.
    ///
    /// Set with `BGE_M3_WORKERS`. Defaults to `2`. Minimum effective value is `1`.
    /// Each worker loads its own model instance.
    pub workers: usize,
    /// Number of intra-op threads each ORT session may use for a single
    /// `session.run()` call (matmul / attention kernels).
    ///
    /// Set with `BGE_M3_INTRA_THREADS`. Defaults to `1`. Minimum effective
    /// value is `1`.
    ///
    /// The default of `1` preserves predictable per-worker RSS (the workspace
    /// probe and quadratic cost model are calibrated against single-threaded
    /// MLAS runs). Raise this on under-utilized hosts where `BGE_M3_WORKERS *
    /// intra_threads <= num_cpus`: e.g. on an 8 vCPU task with `workers=2`,
    /// setting `intra_threads=4` lets each worker fan out to four cores during
    /// inference, taking CPU utilization from ~25% to ~100% under load. Going
    /// above `floor(num_cpus / workers)` causes thread oversubscription and
    /// hurts throughput.
    ///
    /// Re-run the startup probe (do not pin coefficients) after changing this
    /// value so the cost model captures any new scratch-buffer overhead.
    pub intra_threads: usize,
    /// Maximum number of input texts accepted in a single request.
    ///
    /// Set with `BGE_M3_MAX_BATCH`. Defaults to `256`. Minimum effective value is `1`.
    pub max_batch: usize,
    /// Maximum sequence length (tokens) for a single text.
    ///
    /// Set with `BGE_M3_MAX_SEQ_LENGTH`. Defaults to `8192` (BGE-M3's published max).
    /// Range: `[1, 8192]`; out-of-range values are clamped to [`MODEL_MAX_SEQ`]
    /// with a warning. Set lower to reduce memory footprint on constrained hardware.
    ///
    /// The tokenizer will silently truncate any input exceeding this length.
    /// The probe and bin-packer use this as the upper bound when computing
    /// workspace costs.
    pub max_seq_length: usize,
    /// Duration of inactivity after which workers unload their model instances from memory.
    ///
    /// Set with `BGE_M3_IDLE_TIMEOUT_SECS`. Defaults to `300` (5 minutes).
    /// Set to `0` to disable idle unloading entirely (stored as `None`).
    ///
    /// When unloaded, models are automatically reloaded on the next incoming request.
    /// The reload blocks the request until complete (~5–10 s from `CoreML` compiled
    /// cache; ~15–30 s cold).
    pub idle_timeout: Option<Duration>,
    /// ONNX model variant to load.
    ///
    /// Set with `BGE_M3_MODEL`. Accepts `"fp32"`, `"fp16"`, or `"int8"`.
    /// Defaults to `"fp16"` for fleet-wide embedding consistency and reduced RAM
    /// on Linux/Intel deployments. Set `BGE_M3_MODEL=fp32` on Apple Silicon to
    /// recover `CoreML` GPU acceleration. See [`ModelVariant`] for per-variant
    /// performance and memory trade-offs.
    pub model_variant: ModelVariant,

    // --- auto-budget and cost-model knobs ---
    /// Fraction of estimated available workspace to actually use per worker.
    ///
    /// Set with `BGE_M3_MEMORY_SAFETY_FACTOR`. Defaults to `0.7` (30% headroom
    /// for ORT arena fragmentation and spike overhead not captured by the probe).
    /// Range: `0.1..=1.0`; out-of-range values are clamped silently.
    pub memory_safety_factor: f64,

    /// If `Some`, skip the startup probe and use this cost model directly.
    ///
    /// Populated when:
    /// - `BGE_M3_DISABLE_AUTO_BUDGET=1` is set (uses conservative defaults), or
    /// - `BGE_M3_TOKEN_BUDGET` is set (translates the legacy token count to a
    ///   `max_workspace_bytes` using conservative `a`/`b` coefficients), or
    /// - `BGE_M3_COST_MODEL_A` and `BGE_M3_COST_MODEL_B` are both set
    ///   (explicit coefficient override; `BGE_M3_AVAILABLE_MEMORY_BYTES` is
    ///   optional and caps the workspace, otherwise a built-in default applies).
    pub cost_model_override: Option<CostModel>,
    /// Interval (seconds) between periodic heartbeat log events.
    ///
    /// Set with `BGE_M3_HEARTBEAT_SECS`. Defaults to `60`.
    /// Set to `0` to disable heartbeat logging entirely.
    ///
    /// Heartbeat events log RSS, live/loaded worker counts, queue depth,
    /// available request permits, and current probe status — useful for
    /// detecting slow memory leaks or queue saturation between requests.
    pub heartbeat_secs: u64,
}
185
186impl Config {
187 /// Creates a [`Config`] by reading environment variables.
188 ///
189 /// Unrecognized or missing variables fall back to their defaults.
190 #[must_use]
191 pub fn from_env() -> Self {
192 Self::from_lookup(|key| env::var(key).ok())
193 }
194
195 #[allow(clippy::too_many_lines)]
196 /// Creates a [`Config`] by resolving each setting through `lookup`.
197 ///
198 /// `lookup` receives an env-var name and returns its value if set, or
199 /// `None` to fall back to the default for that setting. Used by
200 /// [`Config::from_env`] with the real environment and in tests with a
201 /// closure over a `HashMap`.
202 pub(crate) fn from_lookup<F: Fn(&str) -> Option<String>>(lookup: F) -> Self {
203 let workers = lookup("BGE_M3_WORKERS")
204 .and_then(|v| v.parse::<usize>().ok())
205 .unwrap_or(2)
206 .max(1);
207
208 let intra_threads = lookup("BGE_M3_INTRA_THREADS")
209 .and_then(|v| v.parse::<usize>().ok())
210 .unwrap_or(1)
211 .max(1);
212
213 let max_batch = lookup("BGE_M3_MAX_BATCH")
214 .and_then(|v| v.parse::<usize>().ok())
215 .unwrap_or(256)
216 .max(1);
217
218 let max_seq_length = {
219 let raw = lookup("BGE_M3_MAX_SEQ_LENGTH")
220 .and_then(|v| v.parse::<usize>().ok())
221 .unwrap_or(MODEL_MAX_SEQ);
222 if raw == 0 || raw > MODEL_MAX_SEQ {
223 warn!(
224 requested = raw,
225 clamped = MODEL_MAX_SEQ,
226 "BGE_M3_MAX_SEQ_LENGTH out of range [1, {MODEL_MAX_SEQ}]; clamping"
227 );
228 MODEL_MAX_SEQ
229 } else {
230 raw
231 }
232 };
233
234 let idle_timeout_secs = lookup("BGE_M3_IDLE_TIMEOUT_SECS")
235 .and_then(|v| v.parse::<u64>().ok())
236 .unwrap_or(300);
237 let idle_timeout = (idle_timeout_secs > 0).then(|| Duration::from_secs(idle_timeout_secs));
238
239 let model_variant = match lookup("BGE_M3_MODEL").as_deref() {
240 Some("fp32") => ModelVariant::Fp32,
241 Some("int8") => ModelVariant::Int8,
242 _ => ModelVariant::Fp16,
243 };
244
245 let memory_safety_factor = {
246 let raw = lookup("BGE_M3_MEMORY_SAFETY_FACTOR")
247 .and_then(|v| v.parse::<f64>().ok())
248 .unwrap_or(0.7);
249 raw.clamp(0.1, 1.0)
250 };
251
252 // --- cost model override resolution ---
253 // Priority:
254 // 1. BGE_M3_DISABLE_AUTO_BUDGET → conservative defaults
255 // 2. BGE_M3_TOKEN_BUDGET (legacy) → translates to max_workspace_bytes
256 // 3. BGE_M3_COST_MODEL_A + BGE_M3_COST_MODEL_B + BGE_M3_AVAILABLE_MEMORY_BYTES
257 // 4. None → probe at startup
258
259 let cost_model_override = resolve_cost_model_override(&lookup, max_seq_length);
260
261 // --- legacy BGE_M3_ONNX_BATCH_SIZE deprecation ---
262 if lookup("BGE_M3_ONNX_BATCH_SIZE").is_some() {
263 warn!(
264 "BGE_M3_ONNX_BATCH_SIZE is deprecated and will be removed in a future release. \
265 The server now uses a quadratic-aware cost model and auto-budget probe. \
266 Set BGE_M3_TOKEN_BUDGET to pin a specific workspace ceiling, or remove the \
267 variable to enable fully automatic tuning."
268 );
269 }
270
271 let heartbeat_secs = lookup("BGE_M3_HEARTBEAT_SECS")
272 .and_then(|v| v.parse::<u64>().ok())
273 .unwrap_or(60);
274
275 Self {
276 cache_dir: lookup("BGE_M3_CACHE_DIR").unwrap_or_else(|| "/cache".to_string()),
277 bind_addr: lookup("BGE_M3_BIND").unwrap_or_else(|| "0.0.0.0:8081".to_string()),
278 workers,
279 intra_threads,
280 max_batch,
281 max_seq_length,
282 idle_timeout,
283 model_variant,
284 memory_safety_factor,
285 cost_model_override,
286 heartbeat_secs,
287 }
288 }
289}
290
291/// Resolves an optional `CostModel` from env vars that explicitly override auto-tuning.
292///
293/// Returns `None` when the server should run the startup probe.
294//
295// cast_precision_loss: token_budget and max_seq_length are small integers (≤ 8192)
296// that are well within f64 mantissa range; cost-per-position is an estimate.
297// cast_possible_truncation / cast_sign_loss: the workspace result is always positive
298// (products of positive coefficients and non-negative token counts), and fractional
299// bytes are intentionally floored when converting back to usize.
300#[allow(
301 clippy::cast_precision_loss,
302 clippy::cast_possible_truncation,
303 clippy::cast_sign_loss
304)]
305fn resolve_cost_model_override<F: Fn(&str) -> Option<String>>(
306 lookup: &F,
307 max_seq_length: usize,
308) -> Option<CostModel> {
309 // 1. BGE_M3_DISABLE_AUTO_BUDGET — skip probe, use conservative defaults.
310 // max_workspace_bytes comes from BGE_M3_AVAILABLE_MEMORY_BYTES if set,
311 // otherwise uses the built-in default (2 GiB).
312 if lookup("BGE_M3_DISABLE_AUTO_BUDGET")
313 .is_some_and(|v| matches!(v.as_str(), "1" | "true" | "yes"))
314 {
315 let max_workspace = lookup("BGE_M3_AVAILABLE_MEMORY_BYTES")
316 .and_then(|v| v.parse::<usize>().ok())
317 .unwrap_or(CostModel::DEFAULT_MAX_WORKSPACE);
318 return Some(CostModel::conservative(max_workspace));
319 }
320
321 // 2. BGE_M3_TOKEN_BUDGET — legacy token-count ceiling.
322 // Translates: max_workspace = token_budget × cost_per_token
323 // using conservative coefficients at the configured max_seq_length.
324 if let Some(token_budget) = lookup("BGE_M3_TOKEN_BUDGET").and_then(|v| v.parse::<usize>().ok())
325 {
326 // cost_per_position at max_seq = a + b * max_seq
327 let cost_per_position =
328 CostModel::CONSERVATIVE_A + CostModel::CONSERVATIVE_B * max_seq_length as f64;
329 let max_workspace = (token_budget as f64 * cost_per_position) as usize;
330 return Some(CostModel {
331 a: CostModel::CONSERVATIVE_A,
332 b: CostModel::CONSERVATIVE_B,
333 max_workspace_bytes: max_workspace,
334 });
335 }
336
337 // 3. Explicit coefficient override — requires A, B, AND available memory.
338 if let (Some(a_str), Some(b_str)) =
339 (lookup("BGE_M3_COST_MODEL_A"), lookup("BGE_M3_COST_MODEL_B"))
340 {
341 if let (Ok(a), Ok(b)) = (a_str.parse::<f64>(), b_str.parse::<f64>()) {
342 let max_workspace = lookup("BGE_M3_AVAILABLE_MEMORY_BYTES")
343 .and_then(|v| v.parse::<usize>().ok())
344 .unwrap_or(CostModel::DEFAULT_MAX_WORKSPACE);
345 return Some(CostModel {
346 a,
347 b,
348 max_workspace_bytes: max_workspace,
349 });
350 }
351 }
352
353 // 4. No override — run the startup probe.
354 None
355}
356
// Unit tests (compiled only under `cargo test`); per the `from_lookup` docs,
// they drive configuration through closures over in-memory maps instead of
// the real process environment.
#[cfg(test)]
mod tests;