Skip to main content

bge_m3_embedding_server/probe/
corpus.rs

1// Copyright (c) 2026 J. Patrick Fulton
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Probe text synthesis helpers.
16//!
17//! The probe sweeps `(batch, seq)` shapes by submitting synthesized texts to
18//! the leader worker. Texts come from the curated benchmark corpus; we
19//! repeat/trim corpus entries to hit the target token count for each shape.
20
21/// Loads the benchmark corpus for use as probe text material.
22///
23/// Falls back to a tiny built-in sentence if the corpus file is not found.
24pub(super) fn load_probe_texts() -> Vec<String> {
25    let corpus_path =
26        std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("benches/fixtures/corpus.json");
27    if let Ok(raw) = std::fs::read_to_string(&corpus_path) {
28        if let Ok(json) = serde_json::from_str::<serde_json::Value>(&raw) {
29            if let Some(scenarios) = json["scenarios"].as_object() {
30                let mut texts: Vec<String> = Vec::new();
31                for scenario in scenarios.values() {
32                    if let Some(arr) = scenario["texts"].as_array() {
33                        texts.extend(arr.iter().filter_map(|v| v.as_str().map(String::from)));
34                    }
35                }
36                if !texts.is_empty() {
37                    return texts;
38                }
39            }
40        }
41    }
42    // Fallback: minimal probe text.
43    vec![
44        "The embedding server startup probe synthesizes texts to measure workspace cost."
45            .to_string(),
46    ]
47}
48
49/// Synthesizes `batch` texts each of approximately `target_seq` tokens.
50///
51/// Token estimation: ~4 chars/token for natural English text.
52/// We repeat/trim corpus texts to hit the target character count.
53pub(super) fn synthesize_texts(corpus: &[String], batch: usize, target_seq: usize) -> Vec<String> {
54    let target_chars = target_seq.saturating_mul(4).max(16);
55    (0..batch)
56        .map(|i| {
57            let base = &corpus[i % corpus.len()];
58            // Repeat the base text until we have enough characters.
59            let repeated = base.repeat((target_chars / base.len().max(1)).max(2) + 1);
60            // Trim to target_chars bytes (not chars, but close enough for probing).
61            let trimmed = if repeated.len() > target_chars {
62                &repeated[..target_chars]
63            } else {
64                &repeated
65            };
66            trimmed.to_string()
67        })
68        .collect()
69}