// bge_m3_embedding_server/embedder/types.rs
// Copyright (c) 2026 J. Patrick Fulton
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Public DTOs and the internal `EmbedRequest` enum exchanged between the
//! pool and the worker threads.
use anyhow::Result;
use tokio::sync::oneshot;
20
/// Sparse embedding output from the BGE-M3 sparse-linear projection layer.
///
/// Represents a document as a sparse vector over the tokenizer vocabulary.
/// Token IDs with zero ReLU-gated score are omitted, so `indices` and
/// `values` are parallel arrays of equal length. The `Default` value is the
/// empty sparse vector (no non-zero terms).
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SparseEmbedding {
    /// Sorted vocabulary token IDs with non-zero ReLU-gated weight.
    pub indices: Vec<usize>,
    /// Corresponding ReLU-gated projection scores, in the same order as `indices`.
    pub values: Vec<f32>,
}
32
/// Paired dense + sparse embeddings produced from a single forward pass.
#[derive(Debug, Clone)]
pub struct DualEmbedding {
    /// Dense (float32) embedding vector for the input text.
    pub dense: Vec<f32>,
    /// Sparse embedding over the tokenizer vocabulary for the same input.
    pub sparse: SparseEmbedding,
}
39
/// OS headroom reserved for kernel, stack, ORT arena, and other non-model
/// allocations. Subtracted from available memory before computing the
/// per-worker workspace.
pub(crate) const OS_HEADROOM_BYTES: usize = 256 << 20; // 256 MiB
44
/// Per-request diagnostic statistics captured inside the worker and forwarded
/// to the handler layer for inclusion in the completion log event.
///
/// All fields are integers, so `Eq` is derivable; `Default` yields the
/// all-zero stats used before any work has been recorded.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct EmbedStats {
    /// Number of bin-packed chunks the batch was split into.
    pub chunks: usize,
    /// Maximum tokenized sequence length across all chunks.
    pub max_chunk_seq: usize,
    /// Total token-positions processed (sum of `seq_len` for all inputs).
    pub total_token_positions: usize,
    /// Time spent tokenizing all inputs (milliseconds).
    pub tokenize_ms: u64,
    /// Total time spent in ORT `session.run()` across all chunks (milliseconds).
    pub inference_ms: u64,
}
60
/// Work item exchanged between the pool and a worker thread.
///
/// Each variant carries an input batch plus a `oneshot` reply channel on
/// which the worker returns either the computed embeddings (paired with the
/// per-request [`EmbedStats`]) or an error.
pub(crate) enum EmbedRequest {
    /// Dense (float32) embedding inference on a batch of texts.
    Dense {
        /// Input texts to embed.
        texts: Vec<String>,
        /// Receives the dense embedding vectors for the batch, plus timing stats.
        reply: oneshot::Sender<Result<(Vec<Vec<f32>>, EmbedStats)>>,
    },
    /// Sparse (SPLADE-style) embedding inference on a batch of texts.
    Sparse {
        /// Input texts to embed.
        texts: Vec<String>,
        /// Receives the [`SparseEmbedding`]s for the batch, plus timing stats.
        reply: oneshot::Sender<Result<(Vec<SparseEmbedding>, EmbedStats)>>,
    },
    /// Computes dense and sparse embeddings from a single forward pass per chunk.
    Both {
        /// Input texts to embed.
        texts: Vec<String>,
        /// Receives the [`DualEmbedding`]s for the batch, plus timing stats.
        reply: oneshot::Sender<Result<(Vec<DualEmbedding>, EmbedStats)>>,
    },
    /// Internal: used during startup probe to run a single batch and measure
    /// peak RSS delta. Workers only process this before `ready` is set.
    Probe {
        /// Batch used to exercise a single probe run.
        texts: Vec<String>,
        /// Receives the RSS measurements taken around the probe `session.run()`.
        reply: oneshot::Sender<Result<ProbeResult>>,
    },
}
84
/// Result of a single probe `session.run()` call.
///
/// Derives `Debug`/`Clone`/`Copy` for consistency with the other small
/// value types in this module; both fields are plain byte counts.
#[derive(Debug, Clone, Copy)]
pub(crate) struct ProbeResult {
    /// Process RSS (bytes) measured immediately before `session.run()`.
    pub rss_before: usize,
    /// Process RSS (bytes) measured immediately after `session.run()`.
    pub rss_after: usize,
}

impl ProbeResult {
    /// RSS growth (bytes) observed across the probe run.
    ///
    /// Uses `saturating_sub` so a run during which RSS shrank (e.g. the
    /// allocator released pages back to the OS) yields 0 instead of
    /// underflowing `usize` at the call site.
    pub(crate) fn rss_delta(&self) -> usize {
        self.rss_after.saturating_sub(self.rss_before)
    }
}