Config API changes (src/preempt.rs, src/runtime.rs):
- preempt: promote ALLOC_INTERVAL and TIMESLICE_CYCLES from bare consts to
DEFAULT_ALLOC_INTERVAL / DEFAULT_TIMESLICE_CYCLES; store active values in
thread-locals set on each actor resume so multiple runtimes can use
different settings concurrently.
- runtime: add alloc_interval / timeslice_cycles fields to Config; add
Config::alloc_interval(n) and Config::timeslice_cycles(c) builder methods;
thread the values through RuntimeInner to the reset_timeslice() call in
schedule_loop.
Bench changes:
- Add bench_cfg(threads) helper to general/tokio_favored/smarm_favored that
wraps Config::exact and reads SMARM_ALLOC_INTERVAL / SMARM_TIMESLICE_CYCLES
env vars, so the sweep script can vary knobs without recompiling.
Sweep tooling (benches/sweep.py):
- 'run': run the 3-file bench suite once; --save-baseline persists JSON
- 'regress': compare current run against baseline.json, exit 1 on any bench
that regresses >10% vs stored medians
- 'sweep': run the full SWEEP_GRID (10 points), print comparison table,
optional --save-csv; binaries pre-built so no recompile per point
Sweep results (10-point grid, 1-CPU sandbox):
- The preemption knobs have very little effect on this single-CPU machine.
Most benches move <5% across the entire grid.
- Longer timeslices (tc=600k, tc=1200k) reliably hurt spawn_storm_busy
(+11-15%) and catch_unwind_panics (+10-12%) because actors hold the
scheduler mutex longer per timeslice, stalling the storm of joinable tasks.
- Shorter timeslices (tc=150k) give a small improvement on many_timers
(-3-4%) and a wash everywhere else.
- yield_in_hot_loop and uncontended_channel are essentially flat across all
knobs — both are scheduling-dominated and call yield_now explicitly, so
the RDTSC-driven preemption path is irrelevant.
- Conclusion: the knobs matter primarily under contention (multi-core).
Re-run sweep on a multi-core machine before drawing tuning conclusions.
409 lines
15 KiB
Rust
409 lines
15 KiB
Rust
//! Benchmarks where smarm's design has a structural advantage.
//!
//! These exist to show what the green-thread + stackful model buys you. The
//! single-thread numbers are the most interesting ones — they isolate the
//! per-switch / per-task cost from any contention story.
//!
//! Workloads:
//!   9. deep_recursion       — actor recurses `RECURSE_DEPTH` (500) levels
//!                             deep then returns. In smarm this is plain
//!                             stack recursion on the growable mmap'd stack.
//!                             In tokio, async fn can't directly recurse —
//!                             each level must `Box::pin` its future. We
//!                             measure both.
//!  10. yield_in_hot_loop    — 2 actors ping yield_now back and forth 500k
//!                             times. Pure context-switch cost; no
//!                             channels, no allocation, no contention.
//!                             Smarm's switch is ~6 GPRs + xmm save and a
//!                             `ret`; tokio's is poll → state-machine →
//!                             schedule.
//!  11. uncontended_channel  — single producer, single consumer, 1M msgs,
//!                             single-threaded runtime. With no
//!                             cross-thread contention, smarm's
//!                             Arc<Mutex<>> channel is essentially free,
//!                             and the green-thread switch should beat
//!                             tokio's future polling overhead.
//!  12. catch_unwind_panics  — spawn 10k tasks; half panic, half succeed.
//!                             Supervisor handles each. Exploratory — if
//!                             there's no real gap, drop this one.
|
||
use std::sync::atomic::{AtomicU64, Ordering};
|
||
use std::sync::Arc;
|
||
use std::time::Instant;
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Shared harness
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/// Number of timed runs per benchmark row; `run_n` performs one extra,
/// discarded warm-up run before them.
const ITERS: u32 = 15;
|
||
|
||
/// Thread count used for the multi-threaded benchmark variants.
///
/// Returns the value reported by `std::thread::available_parallelism`, or `1`
/// when that query fails.
fn available_threads() -> usize {
    std::thread::available_parallelism().map_or(1, |n| n.get())
}
|
||
|
||
/// Prints the banner and column headings for one benchmark section.
///
/// `title` is shown between two 80-column `=` rules, followed by the column
/// names (same widths as the rows printed by `run_n`) and a `-` rule.
fn print_header(title: &str) {
    // Width of the horizontal rules.
    const RULE_WIDTH: usize = 80;

    println!("\n{}", "=".repeat(RULE_WIDTH));
    println!(" {title}");
    println!("{}", "=".repeat(RULE_WIDTH));
    println!(
        "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
        "runtime", "result", "median µs", "min µs", "max µs"
    );
    println!("{}", "-".repeat(RULE_WIDTH));
}
|
||
|
||
/// Runs benchmark closure `f` once as a discarded warm-up and then `n` timed
/// times, and prints one table row labelled `name`.
///
/// `f` returns `(result, elapsed_micros)`. The row shows the result of the
/// last timed run followed by the median, minimum and maximum elapsed time.
/// For an even `n` the median is the upper of the two middle samples.
///
/// With `n == 0` there are no samples; a row of `-` placeholders is printed
/// instead of indexing into an empty sample list.
fn run_n<F: FnMut() -> (u64, u128)>(name: &str, n: u32, mut f: F) {
    let _ = f(); // warmup

    let mut times = Vec::with_capacity(n as usize);
    let mut last = 0u64;
    for _ in 0..n {
        let (v, t) = f();
        times.push(t);
        last = v;
    }
    times.sort_unstable();

    // After sorting, the first and last samples are the min and max.
    match (times.first(), times.last()) {
        (Some(&min), Some(&max)) => {
            let median = times[times.len() / 2];
            println!(
                "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
                name, last, median, min, max
            );
        }
        _ => println!(
            "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
            name, "-", "-", "-", "-"
        ),
    }
}
|
||
|
||
// ---------------------------------------------------------------------------
// 9. deep_recursion — RECURSE_DEPTH (500) levels deep
// ---------------------------------------------------------------------------

// Each recursive frame holds an `&AtomicU64`, a `u64`, plus prologue/spill —
// conservatively ~64 B/frame on release. Smarm actor stacks are a fixed 64 KiB,
// so 500 levels (~32 KiB) leaves comfortable headroom while still being deep
// enough to contrast plain stack recursion with Box::pin recursion.
// NOTE(review): other comments in this file describe the actor stack as
// "growable"; confirm which description is current and align them.
|
||
/// Recursion depth shared by the smarm and tokio `deep_recursion` variants.
const RECURSE_DEPTH: u64 = 500;
|
||
|
||
fn bench_recurse_smarm(threads: usize) -> (u64, u128) {
|
||
let total = Arc::new(AtomicU64::new(0));
|
||
let t2 = total.clone();
|
||
let start = Instant::now();
|
||
smarm::runtime::init(bench_cfg(threads)).run(move || {
|
||
// Plain Rust recursion on the actor's own (growable) stack.
|
||
fn recurse(c: &AtomicU64, n: u64) -> u64 {
|
||
if n == 0 {
|
||
c.fetch_add(1, Ordering::Relaxed);
|
||
0
|
||
} else {
|
||
1 + recurse(c, n - 1)
|
||
}
|
||
}
|
||
let h = smarm::spawn(move || {
|
||
let _ = recurse(&t2, RECURSE_DEPTH);
|
||
});
|
||
h.join().unwrap();
|
||
});
|
||
(total.load(Ordering::Relaxed), start.elapsed().as_micros())
|
||
}
|
||
|
||
fn bench_recurse_tokio_current() -> (u64, u128) {
|
||
let counter = Arc::new(AtomicU64::new(0));
|
||
let c2 = counter.clone();
|
||
let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
|
||
let start = Instant::now();
|
||
let local = tokio::task::LocalSet::new();
|
||
local.block_on(&rt, async move {
|
||
// async fn can't self-recurse; each level returns a Box::pin'd future.
|
||
// This is the canonical workaround a real user would write.
|
||
fn recurse(
|
||
c: Arc<AtomicU64>,
|
||
n: u64,
|
||
) -> std::pin::Pin<Box<dyn std::future::Future<Output = u64>>> {
|
||
Box::pin(async move {
|
||
if n == 0 {
|
||
c.fetch_add(1, Ordering::Relaxed);
|
||
0
|
||
} else {
|
||
1 + recurse(c, n - 1).await
|
||
}
|
||
})
|
||
}
|
||
let h = tokio::task::spawn_local(async move {
|
||
let _ = recurse(c2, RECURSE_DEPTH).await;
|
||
});
|
||
let _ = h.await;
|
||
});
|
||
(counter.load(Ordering::Relaxed), start.elapsed().as_micros())
|
||
}
|
||
|
||
fn bench_recurse_tokio_multi() -> (u64, u128) {
|
||
let counter = Arc::new(AtomicU64::new(0));
|
||
let c2 = counter.clone();
|
||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||
.worker_threads(available_threads())
|
||
.build()
|
||
.unwrap();
|
||
let start = Instant::now();
|
||
rt.block_on(async move {
|
||
fn recurse(
|
||
c: Arc<AtomicU64>,
|
||
n: u64,
|
||
) -> std::pin::Pin<Box<dyn std::future::Future<Output = u64> + Send>> {
|
||
Box::pin(async move {
|
||
if n == 0 {
|
||
c.fetch_add(1, Ordering::Relaxed);
|
||
0
|
||
} else {
|
||
1 + recurse(c, n - 1).await
|
||
}
|
||
})
|
||
}
|
||
let h = tokio::spawn(async move {
|
||
let _ = recurse(c2, RECURSE_DEPTH).await;
|
||
});
|
||
let _ = h.await;
|
||
});
|
||
(counter.load(Ordering::Relaxed), start.elapsed().as_micros())
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// 10. yield_in_hot_loop — 2 actors, 500k yields each, single thread
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/// Number of `yield_now` calls made by each of the two actors/tasks in
/// `yield_in_hot_loop`.
const HOT_YIELDS: u64 = 500_000;
|
||
|
||
fn bench_hot_smarm() -> (u64, u128) {
|
||
let start = Instant::now();
|
||
smarm::runtime::init(bench_cfg(1)).run(|| {
|
||
let ha = smarm::spawn(|| {
|
||
for _ in 0..HOT_YIELDS {
|
||
smarm::yield_now();
|
||
}
|
||
});
|
||
let hb = smarm::spawn(|| {
|
||
for _ in 0..HOT_YIELDS {
|
||
smarm::yield_now();
|
||
}
|
||
});
|
||
ha.join().unwrap();
|
||
hb.join().unwrap();
|
||
});
|
||
(HOT_YIELDS * 2, start.elapsed().as_micros())
|
||
}
|
||
|
||
fn bench_hot_tokio_current() -> (u64, u128) {
|
||
let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
|
||
let start = Instant::now();
|
||
let local = tokio::task::LocalSet::new();
|
||
local.block_on(&rt, async move {
|
||
let ha = tokio::task::spawn_local(async move {
|
||
for _ in 0..HOT_YIELDS {
|
||
tokio::task::yield_now().await;
|
||
}
|
||
});
|
||
let hb = tokio::task::spawn_local(async move {
|
||
for _ in 0..HOT_YIELDS {
|
||
tokio::task::yield_now().await;
|
||
}
|
||
});
|
||
let _ = ha.await;
|
||
let _ = hb.await;
|
||
});
|
||
(HOT_YIELDS * 2, start.elapsed().as_micros())
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// 11. uncontended_channel — 1 producer, 1 consumer, 1M msgs, single-threaded
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/// Number of messages the producer sends in `uncontended_channel`.
const UNCONT_MSGS: u64 = 1_000_000;
|
||
|
||
fn bench_unc_smarm() -> (u64, u128) {
|
||
let start = Instant::now();
|
||
smarm::runtime::init(bench_cfg(1)).run(|| {
|
||
let (tx, rx) = smarm::channel::<u64>();
|
||
let consumer = smarm::spawn(move || {
|
||
let mut count = 0u64;
|
||
while let Ok(_) = rx.recv() {
|
||
count += 1;
|
||
}
|
||
let _ = count; // discard; run() closure must return ()
|
||
});
|
||
let producer = smarm::spawn(move || {
|
||
for i in 0..UNCONT_MSGS {
|
||
tx.send(i).unwrap();
|
||
}
|
||
// tx drops here, closing the channel.
|
||
});
|
||
producer.join().unwrap();
|
||
let _ = consumer.join().unwrap();
|
||
});
|
||
(UNCONT_MSGS, start.elapsed().as_micros())
|
||
}
|
||
|
||
fn bench_unc_tokio_current() -> (u64, u128) {
|
||
let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
|
||
let start = Instant::now();
|
||
let local = tokio::task::LocalSet::new();
|
||
local.block_on(&rt, async move {
|
||
let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<u64>();
|
||
let consumer = tokio::task::spawn_local(async move {
|
||
let mut count = 0u64;
|
||
while let Some(_) = rx.recv().await {
|
||
count += 1;
|
||
}
|
||
count
|
||
});
|
||
let producer = tokio::task::spawn_local(async move {
|
||
for i in 0..UNCONT_MSGS {
|
||
tx.send(i).unwrap();
|
||
}
|
||
});
|
||
let _ = producer.await;
|
||
let _ = consumer.await;
|
||
});
|
||
(UNCONT_MSGS, start.elapsed().as_micros())
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// 12. catch_unwind_panics — 10k tasks, half panic
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/// Number of tasks spawned per run in `catch_unwind_panics`; the ones with an
/// even index panic.
const PANIC_TASKS: u64 = 10_000;
|
||
|
||
fn bench_panic_smarm(threads: usize) -> (u64, u128) {
|
||
let ok = Arc::new(AtomicU64::new(0));
|
||
let err = Arc::new(AtomicU64::new(0));
|
||
let ok2 = ok.clone();
|
||
let err2 = err.clone();
|
||
let start = Instant::now();
|
||
smarm::runtime::init(bench_cfg(threads)).run(move || {
|
||
let mut handles = Vec::new();
|
||
for i in 0..PANIC_TASKS {
|
||
handles.push(smarm::spawn(move || {
|
||
if i % 2 == 0 {
|
||
panic!("planned");
|
||
}
|
||
}));
|
||
}
|
||
for h in handles {
|
||
match h.join() {
|
||
Ok(()) => { ok2.fetch_add(1, Ordering::Relaxed); }
|
||
Err(_) => { err2.fetch_add(1, Ordering::Relaxed); }
|
||
}
|
||
}
|
||
});
|
||
let total = ok.load(Ordering::Relaxed) + err.load(Ordering::Relaxed);
|
||
(total, start.elapsed().as_micros())
|
||
}
|
||
|
||
fn bench_panic_tokio_current() -> (u64, u128) {
|
||
let ok = Arc::new(AtomicU64::new(0));
|
||
let err = Arc::new(AtomicU64::new(0));
|
||
let ok2 = ok.clone();
|
||
let err2 = err.clone();
|
||
let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
|
||
let start = Instant::now();
|
||
let local = tokio::task::LocalSet::new();
|
||
local.block_on(&rt, async move {
|
||
let mut handles = Vec::new();
|
||
for i in 0..PANIC_TASKS {
|
||
handles.push(tokio::task::spawn_local(async move {
|
||
if i % 2 == 0 {
|
||
panic!("planned");
|
||
}
|
||
}));
|
||
}
|
||
for h in handles {
|
||
match h.await {
|
||
Ok(()) => { ok2.fetch_add(1, Ordering::Relaxed); }
|
||
Err(_) => { err2.fetch_add(1, Ordering::Relaxed); }
|
||
}
|
||
}
|
||
});
|
||
let total = ok.load(Ordering::Relaxed) + err.load(Ordering::Relaxed);
|
||
(total, start.elapsed().as_micros())
|
||
}
|
||
|
||
fn bench_panic_tokio_multi() -> (u64, u128) {
|
||
let ok = Arc::new(AtomicU64::new(0));
|
||
let err = Arc::new(AtomicU64::new(0));
|
||
let ok2 = ok.clone();
|
||
let err2 = err.clone();
|
||
let rt = tokio::runtime::Builder::new_multi_thread()
|
||
.worker_threads(available_threads())
|
||
.build()
|
||
.unwrap();
|
||
let start = Instant::now();
|
||
rt.block_on(async move {
|
||
let mut handles = Vec::new();
|
||
for i in 0..PANIC_TASKS {
|
||
handles.push(tokio::spawn(async move {
|
||
if i % 2 == 0 {
|
||
panic!("planned");
|
||
}
|
||
}));
|
||
}
|
||
for h in handles {
|
||
match h.await {
|
||
Ok(()) => { ok2.fetch_add(1, Ordering::Relaxed); }
|
||
Err(_) => { err2.fetch_add(1, Ordering::Relaxed); }
|
||
}
|
||
}
|
||
});
|
||
let total = ok.load(Ordering::Relaxed) + err.load(Ordering::Relaxed);
|
||
(total, start.elapsed().as_micros())
|
||
}
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// main
|
||
// ---------------------------------------------------------------------------
|
||
|
||
|
||
// ---------------------------------------------------------------------------
|
||
// Knob helper — reads SMARM_ALLOC_INTERVAL / SMARM_TIMESLICE_CYCLES env vars
|
||
// so the sweep script can override the preemption knobs without recompiling.
|
||
// ---------------------------------------------------------------------------
|
||
|
||
/// Builds the smarm runtime config used by every smarm benchmark here.
///
/// Starts from `Config::exact(threads)` and applies two optional overrides
/// read from the environment on each call:
///
/// * `SMARM_ALLOC_INTERVAL` (parsed as `u32`) → `Config::alloc_interval`
/// * `SMARM_TIMESLICE_CYCLES` (parsed as `u64`) → `Config::timeslice_cycles`
///
/// An unset variable leaves the corresponding setting untouched. A value that
/// is set but unusable is still ignored, but a warning is printed to stderr so
/// a mistyped sweep point is not mistaken for a real measurement. The warning
/// repeats on every call.
fn bench_cfg(threads: usize) -> smarm::runtime::Config {
    /// Reads and parses one knob; `None` when it is unset or unusable.
    fn env_knob<T: std::str::FromStr>(name: &str) -> Option<T> {
        match std::env::var(name) {
            Ok(raw) => match raw.parse::<T>() {
                Ok(value) => Some(value),
                Err(_) => {
                    eprintln!("warning: ignoring {name}={raw:?}: not a valid value");
                    None
                }
            },
            Err(std::env::VarError::NotPresent) => None,
            Err(std::env::VarError::NotUnicode(_)) => {
                eprintln!("warning: ignoring {name}: value is not valid Unicode");
                None
            }
        }
    }

    let mut cfg = smarm::runtime::Config::exact(threads);
    if let Some(n) = env_knob::<u32>("SMARM_ALLOC_INTERVAL") {
        cfg = cfg.alloc_interval(n);
    }
    if let Some(c) = env_knob::<u64>("SMARM_TIMESLICE_CYCLES") {
        cfg = cfg.timeslice_cycles(c);
    }
    cfg
}
|
||
|
||
fn main() {
|
||
let n = available_threads();
|
||
println!("smarm smarm-favored benchmarks");
|
||
println!("available parallelism: {n} threads");
|
||
println!("ITERS={ITERS} (+1 warmup, discarded)");
|
||
println!(
|
||
"RECURSE_DEPTH={RECURSE_DEPTH}, HOT_YIELDS={HOT_YIELDS}×2, \
|
||
UNCONT_MSGS={UNCONT_MSGS}, PANIC_TASKS={PANIC_TASKS}"
|
||
);
|
||
|
||
// ---- 9. deep_recursion ----
|
||
print_header(&format!("deep_recursion: depth {RECURSE_DEPTH}"));
|
||
run_n("smarm 1-thread", ITERS, || bench_recurse_smarm(1));
|
||
run_n(&format!("smarm {n}-thread"), ITERS, || bench_recurse_smarm(n));
|
||
run_n("tokio current_thread", ITERS, bench_recurse_tokio_current);
|
||
run_n("tokio multi-thread", ITERS, bench_recurse_tokio_multi);
|
||
|
||
// ---- 10. yield_in_hot_loop ----
|
||
print_header(&format!("yield_in_hot_loop: 2 actors × {HOT_YIELDS} yields (single thread)"));
|
||
run_n("smarm 1-thread", ITERS, bench_hot_smarm);
|
||
run_n("tokio current_thread", ITERS, bench_hot_tokio_current);
|
||
|
||
// ---- 11. uncontended_channel ----
|
||
print_header(&format!("uncontended_channel: 1→1, {UNCONT_MSGS} msgs (single thread)"));
|
||
run_n("smarm 1-thread", ITERS, bench_unc_smarm);
|
||
run_n("tokio current_thread", ITERS, bench_unc_tokio_current);
|
||
|
||
// ---- 12. catch_unwind_panics ----
|
||
print_header(&format!("catch_unwind_panics: {PANIC_TASKS} tasks, 50% panic"));
|
||
run_n("smarm 1-thread", ITERS, || bench_panic_smarm(1));
|
||
run_n(&format!("smarm {n}-thread"), ITERS, || bench_panic_smarm(n));
|
||
run_n("tokio current_thread", ITERS, bench_panic_tokio_current);
|
||
run_n("tokio multi-thread", ITERS, bench_panic_tokio_multi);
|
||
}
|