// bge_m3_embedding_server/config.rs
1// Copyright (c) 2026 J. Patrick Fulton
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Server configuration loaded from environment variables at startup.
16//!
17//! All settings are read once via [`Config::from_env`] and then immutable
18//! for the server's lifetime. See each field's doc comment for the
19//! corresponding environment variable name and default value.
20
21use crate::binpack::CostModel;
22use std::env;
23use std::time::Duration;
24use tracing::warn;
25
26/// ONNX model variant to load.
27///
28/// Controlled by `BGE_M3_MODEL`. Defaults to [`ModelVariant::Fp16`].
/// ONNX model variant to load.
///
/// Controlled by `BGE_M3_MODEL`. Defaults to [`ModelVariant::Fp16`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ModelVariant {
    /// BAAI/bge-m3 FP32 model (~2.16 GB per session).
    ///
    /// Enabled via `BGE_M3_MODEL=fp32`. The FP32 ONNX graph contains no Cast
    /// nodes, so on Apple Silicon ORT can dispatch the entire multi-head
    /// attention + FFN block to the `CoreML` EP as one contiguous GPU subgraph,
    /// delivering 20–61% lower latency than the MLAS CPU baseline. Recommended
    /// for `CoreML` deployments where latency is the primary constraint.
    ///
    /// **Not the default.** Linux/Intel (MLAS-only) deployments should prefer
    /// [`ModelVariant::Fp16`] for lower RAM and fleet-wide embedding consistency.
    Fp32,
    /// Xenova/bge-m3 FP16 model (~1.08 GB per session). **Default.**
    /// Roughly halves per-session memory relative to FP32 (~50% reduction;
    /// ~1.08 GB vs ~2.16 GB).
    ///
    /// This is the fleet default: every Apple Silicon `LaunchAgent` deployment
    /// sets `BGE_M3_MODEL=fp16` explicitly, and the server default matches so
    /// that Linux/Docker deployments emit consistent embeddings with no extra
    /// configuration.
    ///
    /// **Latency caveat (`CoreML` only).** The Xenova FP16 ONNX model inserts
    /// FP16↔FP32 Cast nodes at every transformer-layer boundary. ORT's `CoreML`
    /// EP cannot fuse across them: each Cast executes on CPU and the transformer
    /// block never forms a single contiguous GPU subgraph, making FP16 +
    /// `CoreML` 6–10× slower than FP32 + `CoreML`. On the MLAS/CPU EP (Linux,
    /// Intel) a similar ~6–9× Cast penalty applies and is the accepted
    /// trade-off for lower RAM and fleet consistency. Use `BGE_M3_MODEL=fp32`
    /// on Apple Silicon to recover `CoreML` GPU acceleration.
    Fp16,
    /// Xenova/bge-m3 INT8 quantized model (~568 MB per session).
    /// Weights-only quantization; ORT dequantizes to f32 internally. Cuts peak
    /// memory per worker by ~74% vs FP32.
    ///
    /// Embedding quality validated: dense cosine similarity ≥ 0.963 against the
    /// FP32 reference across a 184-text corpus — suitable for ANN search and
    /// semantic ranking. Avoid where rankings hinge on very small similarity
    /// margins (< 0.05 apart).
    ///
    /// **Use with MLAS (CPU EP) only.** `DequantizeLinear` nodes fragment the
    /// `CoreML` execution plan exactly as FP16 Cast nodes do; INT8 + `CoreML`
    /// EP runs 42–79% slower than INT8 + MLAS with no GPU benefit.
    Int8,
}

impl std::fmt::Display for ModelVariant {
    /// Writes the lowercase env-var spelling (`fp32` / `fp16` / `int8`).
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::Fp32 => "fp32",
            Self::Fp16 => "fp16",
            Self::Int8 => "int8",
        };
        f.write_str(name)
    }
}
84
/// Hard architectural ceiling on sequence length.
///
/// BGE-M3's positional-embedding table covers 8192 positions, so this is the
/// upper bound against which `BGE_M3_MAX_SEQ_LENGTH` is validated (and clamped).
pub const MODEL_MAX_SEQ: usize = 8192;
89
90/// Runtime configuration loaded from environment variables.
91///
92/// All fields are read once at startup via [`Config::from_env`]. Changes to
93/// environment variables after startup have no effect.
94pub struct Config {
95    /// Path to the directory where ONNX model files are cached.
96    ///
97    /// Set with `BGE_M3_CACHE_DIR`. Defaults to `/cache`.
98    pub cache_dir: String,
99    /// TCP bind address for the HTTP server.
100    ///
101    /// Set with `BGE_M3_BIND`. Defaults to `0.0.0.0:8081`.
102    /// The `0.0.0.0` default is intentional for Docker container deployments.
103    pub bind_addr: String,
104    /// Number of embedding worker threads to spawn.
105    ///
106    /// Set with `BGE_M3_WORKERS`. Defaults to `2`. Minimum effective value is `1`.
107    /// Each worker loads its own model instance.
108    pub workers: usize,
109    /// Number of intra-op threads each ORT session may use for a single
110    /// `session.run()` call (matmul / attention kernels).
111    ///
112    /// Set with `BGE_M3_INTRA_THREADS`. Defaults to `1`. Minimum effective
113    /// value is `1`.
114    ///
115    /// The default of `1` preserves predictable per-worker RSS (the workspace
116    /// probe and quadratic cost model are calibrated against single-threaded
117    /// MLAS runs). Raise this on under-utilized hosts where `BGE_M3_WORKERS *
118    /// intra_threads <= num_cpus`: e.g. on an 8 vCPU task with `workers=2`,
119    /// setting `intra_threads=4` lets each worker fan out to four cores during
120    /// inference, taking CPU utilization from ~25% to ~100% under load. Going
121    /// above `floor(num_cpus / workers)` causes thread oversubscription and
122    /// hurts throughput.
123    ///
124    /// Re-run the startup probe (do not pin coefficients) after changing this
125    /// value so the cost model captures any new scratch-buffer overhead.
126    pub intra_threads: usize,
127    /// Maximum number of input texts accepted in a single request.
128    ///
129    /// Set with `BGE_M3_MAX_BATCH`. Defaults to `256`. Minimum effective value is `1`.
130    pub max_batch: usize,
131    /// Maximum sequence length (tokens) for a single text.
132    ///
133    /// Set with `BGE_M3_MAX_SEQ_LENGTH`. Defaults to `8192` (BGE-M3's published max).
134    /// Range: `[1, 8192]`. Set lower to reduce memory footprint on constrained hardware.
135    ///
136    /// The tokenizer will silently truncate any input exceeding this length.
137    /// The probe and bin-packer use this as the upper bound when computing
138    /// workspace costs.
139    pub max_seq_length: usize,
140    /// Duration of inactivity after which workers unload their model instances from memory.
141    ///
142    /// Set with `BGE_M3_IDLE_TIMEOUT_SECS`. Defaults to `300` (5 minutes).
143    /// Set to `0` to disable idle unloading entirely.
144    ///
145    /// When unloaded, models are automatically reloaded on the next incoming request.
146    /// The reload blocks the request until complete (~5–10 s from `CoreML` compiled
147    /// cache; ~15–30 s cold).
148    pub idle_timeout: Option<Duration>,
149    /// ONNX model variant to load.
150    ///
151    /// Set with `BGE_M3_MODEL`. Accepts `"fp32"`, `"fp16"`, or `"int8"`.
152    /// Defaults to `"fp16"` for fleet-wide embedding consistency and reduced RAM
153    /// on Linux/Intel deployments. Set `BGE_M3_MODEL=fp32` on Apple Silicon to
154    /// recover `CoreML` GPU acceleration. See [`ModelVariant`] for per-variant
155    /// performance and memory trade-offs.
156    pub model_variant: ModelVariant,
157
158    // --- auto-budget and cost-model knobs ---
159    /// Fraction of estimated available workspace to actually use per worker.
160    ///
161    /// Set with `BGE_M3_MEMORY_SAFETY_FACTOR`. Defaults to `0.7` (30% headroom
162    /// for ORT arena fragmentation and spike overhead not captured by the probe).
163    /// Range: `0.1..=1.0`.
164    pub memory_safety_factor: f64,
165
166    /// If `Some`, skip the startup probe and use this cost model directly.
167    ///
168    /// Populated when:
169    /// - `BGE_M3_DISABLE_AUTO_BUDGET=1` is set (uses conservative defaults), or
170    /// - `BGE_M3_TOKEN_BUDGET` is set (translates the legacy token count to a
171    ///   `max_workspace_bytes` using conservative `a`/`b` coefficients), or
172    /// - `BGE_M3_COST_MODEL_A` and `BGE_M3_COST_MODEL_B` are both set with
173    ///   `BGE_M3_AVAILABLE_MEMORY_BYTES` (full explicit override).
174    pub cost_model_override: Option<CostModel>,
175    /// Interval (seconds) between periodic heartbeat log events.
176    ///
177    /// Set with `BGE_M3_HEARTBEAT_SECS`. Defaults to `60`.
178    /// Set to `0` to disable heartbeat logging entirely.
179    ///
180    /// Heartbeat events log RSS, live/loaded worker counts, queue depth,
181    /// available request permits, and current probe status — useful for
182    /// detecting slow memory leaks or queue saturation between requests.
183    pub heartbeat_secs: u64,
184}
185
186impl Config {
187    /// Creates a [`Config`] by reading environment variables.
188    ///
189    /// Unrecognized or missing variables fall back to their defaults.
190    #[must_use]
191    pub fn from_env() -> Self {
192        Self::from_lookup(|key| env::var(key).ok())
193    }
194
195    #[allow(clippy::too_many_lines)]
196    /// Creates a [`Config`] by resolving each setting through `lookup`.
197    ///
198    /// `lookup` receives an env-var name and returns its value if set, or
199    /// `None` to fall back to the default for that setting. Used by
200    /// [`Config::from_env`] with the real environment and in tests with a
201    /// closure over a `HashMap`.
202    pub(crate) fn from_lookup<F: Fn(&str) -> Option<String>>(lookup: F) -> Self {
203        let workers = lookup("BGE_M3_WORKERS")
204            .and_then(|v| v.parse::<usize>().ok())
205            .unwrap_or(2)
206            .max(1);
207
208        let intra_threads = lookup("BGE_M3_INTRA_THREADS")
209            .and_then(|v| v.parse::<usize>().ok())
210            .unwrap_or(1)
211            .max(1);
212
213        let max_batch = lookup("BGE_M3_MAX_BATCH")
214            .and_then(|v| v.parse::<usize>().ok())
215            .unwrap_or(256)
216            .max(1);
217
218        let max_seq_length = {
219            let raw = lookup("BGE_M3_MAX_SEQ_LENGTH")
220                .and_then(|v| v.parse::<usize>().ok())
221                .unwrap_or(MODEL_MAX_SEQ);
222            if raw == 0 || raw > MODEL_MAX_SEQ {
223                warn!(
224                    requested = raw,
225                    clamped = MODEL_MAX_SEQ,
226                    "BGE_M3_MAX_SEQ_LENGTH out of range [1, {MODEL_MAX_SEQ}]; clamping"
227                );
228                MODEL_MAX_SEQ
229            } else {
230                raw
231            }
232        };
233
234        let idle_timeout_secs = lookup("BGE_M3_IDLE_TIMEOUT_SECS")
235            .and_then(|v| v.parse::<u64>().ok())
236            .unwrap_or(300);
237        let idle_timeout = (idle_timeout_secs > 0).then(|| Duration::from_secs(idle_timeout_secs));
238
239        let model_variant = match lookup("BGE_M3_MODEL").as_deref() {
240            Some("fp32") => ModelVariant::Fp32,
241            Some("int8") => ModelVariant::Int8,
242            _ => ModelVariant::Fp16,
243        };
244
245        let memory_safety_factor = {
246            let raw = lookup("BGE_M3_MEMORY_SAFETY_FACTOR")
247                .and_then(|v| v.parse::<f64>().ok())
248                .unwrap_or(0.7);
249            raw.clamp(0.1, 1.0)
250        };
251
252        // --- cost model override resolution ---
253        // Priority:
254        //  1. BGE_M3_DISABLE_AUTO_BUDGET → conservative defaults
255        //  2. BGE_M3_TOKEN_BUDGET (legacy) → translates to max_workspace_bytes
256        //  3. BGE_M3_COST_MODEL_A + BGE_M3_COST_MODEL_B + BGE_M3_AVAILABLE_MEMORY_BYTES
257        //  4. None → probe at startup
258
259        let cost_model_override = resolve_cost_model_override(&lookup, max_seq_length);
260
261        // --- legacy BGE_M3_ONNX_BATCH_SIZE deprecation ---
262        if lookup("BGE_M3_ONNX_BATCH_SIZE").is_some() {
263            warn!(
264                "BGE_M3_ONNX_BATCH_SIZE is deprecated and will be removed in a future release. \
265                 The server now uses a quadratic-aware cost model and auto-budget probe. \
266                 Set BGE_M3_TOKEN_BUDGET to pin a specific workspace ceiling, or remove the \
267                 variable to enable fully automatic tuning."
268            );
269        }
270
271        let heartbeat_secs = lookup("BGE_M3_HEARTBEAT_SECS")
272            .and_then(|v| v.parse::<u64>().ok())
273            .unwrap_or(60);
274
275        Self {
276            cache_dir: lookup("BGE_M3_CACHE_DIR").unwrap_or_else(|| "/cache".to_string()),
277            bind_addr: lookup("BGE_M3_BIND").unwrap_or_else(|| "0.0.0.0:8081".to_string()),
278            workers,
279            intra_threads,
280            max_batch,
281            max_seq_length,
282            idle_timeout,
283            model_variant,
284            memory_safety_factor,
285            cost_model_override,
286            heartbeat_secs,
287        }
288    }
289}
290
291/// Resolves an optional `CostModel` from env vars that explicitly override auto-tuning.
292///
293/// Returns `None` when the server should run the startup probe.
294//
295// cast_precision_loss: token_budget and max_seq_length are small integers (≤ 8192)
296//   that are well within f64 mantissa range; cost-per-position is an estimate.
297// cast_possible_truncation / cast_sign_loss: the workspace result is always positive
298//   (products of positive coefficients and non-negative token counts), and fractional
299//   bytes are intentionally floored when converting back to usize.
300#[allow(
301    clippy::cast_precision_loss,
302    clippy::cast_possible_truncation,
303    clippy::cast_sign_loss
304)]
305fn resolve_cost_model_override<F: Fn(&str) -> Option<String>>(
306    lookup: &F,
307    max_seq_length: usize,
308) -> Option<CostModel> {
309    // 1. BGE_M3_DISABLE_AUTO_BUDGET — skip probe, use conservative defaults.
310    //    max_workspace_bytes comes from BGE_M3_AVAILABLE_MEMORY_BYTES if set,
311    //    otherwise uses the built-in default (2 GiB).
312    if lookup("BGE_M3_DISABLE_AUTO_BUDGET")
313        .is_some_and(|v| matches!(v.as_str(), "1" | "true" | "yes"))
314    {
315        let max_workspace = lookup("BGE_M3_AVAILABLE_MEMORY_BYTES")
316            .and_then(|v| v.parse::<usize>().ok())
317            .unwrap_or(CostModel::DEFAULT_MAX_WORKSPACE);
318        return Some(CostModel::conservative(max_workspace));
319    }
320
321    // 2. BGE_M3_TOKEN_BUDGET — legacy token-count ceiling.
322    //    Translates: max_workspace = token_budget × cost_per_token
323    //    using conservative coefficients at the configured max_seq_length.
324    if let Some(token_budget) = lookup("BGE_M3_TOKEN_BUDGET").and_then(|v| v.parse::<usize>().ok())
325    {
326        // cost_per_position at max_seq = a + b * max_seq
327        let cost_per_position =
328            CostModel::CONSERVATIVE_A + CostModel::CONSERVATIVE_B * max_seq_length as f64;
329        let max_workspace = (token_budget as f64 * cost_per_position) as usize;
330        return Some(CostModel {
331            a: CostModel::CONSERVATIVE_A,
332            b: CostModel::CONSERVATIVE_B,
333            max_workspace_bytes: max_workspace,
334        });
335    }
336
337    // 3. Explicit coefficient override — requires A, B, AND available memory.
338    if let (Some(a_str), Some(b_str)) =
339        (lookup("BGE_M3_COST_MODEL_A"), lookup("BGE_M3_COST_MODEL_B"))
340    {
341        if let (Ok(a), Ok(b)) = (a_str.parse::<f64>(), b_str.parse::<f64>()) {
342            let max_workspace = lookup("BGE_M3_AVAILABLE_MEMORY_BYTES")
343                .and_then(|v| v.parse::<usize>().ok())
344                .unwrap_or(CostModel::DEFAULT_MAX_WORKSPACE);
345            return Some(CostModel {
346                a,
347                b,
348                max_workspace_bytes: max_workspace,
349            });
350        }
351    }
352
353    // 4. No override — run the startup probe.
354    None
355}
356
357#[cfg(test)]
358mod tests;