// bge_m3_embedding_server/embedder/types.rs
// Copyright (c) 2026 J. Patrick Fulton
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

//! Public DTOs and the internal `EmbedRequest` enum exchanged between the
//! pool and the worker threads.
use anyhow::Result;
use tokio::sync::oneshot;
20
/// Sparse embedding output from the BGE-M3 sparse-linear projection layer.
///
/// Represents a document as a sparse vector over the tokenizer vocabulary.
/// Token IDs with zero ReLU-gated score are omitted, so `indices` and
/// `values` are parallel arrays of equal length. The `Default` value is the
/// empty sparse vector (no non-zero terms).
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SparseEmbedding {
    /// Sorted vocabulary token IDs with non-zero ReLU-gated weight.
    pub indices: Vec<usize>,
    /// Corresponding ReLU-gated projection scores, in the same order as `indices`.
    pub values: Vec<f32>,
}
32
/// Paired dense + sparse embeddings produced from a single forward pass.
#[derive(Debug, Clone)]
pub struct DualEmbedding {
    /// Dense (float32) embedding vector for the input text.
    pub dense: Vec<f32>,
    /// Sparse embedding over the tokenizer vocabulary for the same input.
    pub sparse: SparseEmbedding,
}
39
/// OS headroom reserved for kernel, stack, ORT arena, and other non-model
/// allocations. Subtracted from available memory before computing the
/// per-worker workspace.
pub(crate) const OS_HEADROOM_BYTES: usize = 256 << 20; // 256 MiB
44
/// Per-request diagnostic statistics captured inside the worker and forwarded
/// to the handler layer for inclusion in the completion log event.
///
/// All fields are integers, so `Eq` is derivable; `Default` yields the
/// all-zero stats used before any work has been recorded.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct EmbedStats {
    /// Number of bin-packed chunks the batch was split into.
    pub chunks: usize,
    /// Maximum tokenized sequence length across all chunks.
    pub max_chunk_seq: usize,
    /// Total token-positions processed (sum of `seq_len` for all inputs).
    pub total_token_positions: usize,
    /// Time spent tokenizing all inputs (milliseconds).
    pub tokenize_ms: u64,
    /// Total time spent in ORT `session.run()` across all chunks (milliseconds).
    pub inference_ms: u64,
}
60
/// Work item exchanged between the pool and a worker thread.
///
/// Each variant carries an input batch plus a `oneshot` reply channel on
/// which the worker returns either the computed embeddings (paired with the
/// per-request [`EmbedStats`]) or an error.
pub(crate) enum EmbedRequest {
    /// Dense (float32) embedding inference on a batch of texts.
    Dense {
        /// Input texts to embed.
        texts: Vec<String>,
        /// Receives the dense embedding vectors for the batch, plus timing stats.
        reply: oneshot::Sender<Result<(Vec<Vec<f32>>, EmbedStats)>>,
    },
    /// Sparse (SPLADE-style) embedding inference on a batch of texts.
    Sparse {
        /// Input texts to embed.
        texts: Vec<String>,
        /// Receives the [`SparseEmbedding`]s for the batch, plus timing stats.
        reply: oneshot::Sender<Result<(Vec<SparseEmbedding>, EmbedStats)>>,
    },
    /// Computes dense and sparse embeddings from a single forward pass per chunk.
    Both {
        /// Input texts to embed.
        texts: Vec<String>,
        /// Receives the [`DualEmbedding`]s for the batch, plus timing stats.
        reply: oneshot::Sender<Result<(Vec<DualEmbedding>, EmbedStats)>>,
    },
    /// Internal: used during startup probe to run a single batch and measure
    /// peak RSS delta. Workers only process this before `ready` is set.
    Probe {
        /// Batch used to exercise a single probe run.
        texts: Vec<String>,
        /// Receives the RSS measurements taken around the probe `session.run()`.
        reply: oneshot::Sender<Result<ProbeResult>>,
    },
}
84
/// Result of a single probe `session.run()` call.
///
/// Derives `Debug`/`Clone`/`Copy` for consistency with the other small
/// value types in this module; both fields are plain byte counts.
#[derive(Debug, Clone, Copy)]
pub(crate) struct ProbeResult {
    /// Process RSS (bytes) measured immediately before `session.run()`.
    pub rss_before: usize,
    /// Process RSS (bytes) measured immediately after `session.run()`.
    pub rss_after: usize,
}

impl ProbeResult {
    /// RSS growth (bytes) observed across the probe run.
    ///
    /// Uses `saturating_sub` so a run during which RSS shrank (e.g. the
    /// allocator released pages back to the OS) yields 0 instead of
    /// underflowing `usize` at the call site.
    pub(crate) fn rss_delta(&self) -> usize {
        self.rss_after.saturating_sub(self.rss_before)
    }
}