bge_m3_embedding_server/probe/corpus.rs
1// Copyright (c) 2026 J. Patrick Fulton
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Probe text synthesis helpers.
16//!
17//! The probe sweeps `(batch, seq)` shapes by submitting synthesized texts to
18//! the leader worker. Texts come from the curated benchmark corpus; we
19//! repeat/trim corpus entries to hit the target token count for each shape.
20
21/// Loads the benchmark corpus for use as probe text material.
22///
23/// Falls back to a tiny built-in sentence if the corpus file is not found.
24pub(super) fn load_probe_texts() -> Vec<String> {
25 let corpus_path =
26 std::path::Path::new(env!("CARGO_MANIFEST_DIR")).join("benches/fixtures/corpus.json");
27 if let Ok(raw) = std::fs::read_to_string(&corpus_path) {
28 if let Ok(json) = serde_json::from_str::<serde_json::Value>(&raw) {
29 if let Some(scenarios) = json["scenarios"].as_object() {
30 let mut texts: Vec<String> = Vec::new();
31 for scenario in scenarios.values() {
32 if let Some(arr) = scenario["texts"].as_array() {
33 texts.extend(arr.iter().filter_map(|v| v.as_str().map(String::from)));
34 }
35 }
36 if !texts.is_empty() {
37 return texts;
38 }
39 }
40 }
41 }
42 // Fallback: minimal probe text.
43 vec![
44 "The embedding server startup probe synthesizes texts to measure workspace cost."
45 .to_string(),
46 ]
47}
48
49/// Synthesizes `batch` texts each of approximately `target_seq` tokens.
50///
51/// Token estimation: ~4 chars/token for natural English text.
52/// We repeat/trim corpus texts to hit the target character count.
53pub(super) fn synthesize_texts(corpus: &[String], batch: usize, target_seq: usize) -> Vec<String> {
54 let target_chars = target_seq.saturating_mul(4).max(16);
55 (0..batch)
56 .map(|i| {
57 let base = &corpus[i % corpus.len()];
58 // Repeat the base text until we have enough characters.
59 let repeated = base.repeat((target_chars / base.len().max(1)).max(2) + 1);
60 // Trim to target_chars bytes (not chars, but close enough for probing).
61 let trimmed = if repeated.len() > target_chars {
62 &repeated[..target_chars]
63 } else {
64 &repeated
65 };
66 trimmed.to_string()
67 })
68 .collect()
69}