Files
smarm/benches/smarm_favored.rs
Bench 6d1c59fb99 benches: baseline results
Two compile fixes:
- tokio_favored.rs bench_mpsc_smarm: consumer spawn closure returned u64 via
  bare 'count' tail expression; smarm::Runtime::run() requires FnOnce()->().
  Fixed to 'let _ = count;'. Same fix on the consumer.join() call site.
- smarm_favored.rs bench_unc_smarm: same pattern, same fix.

Baseline run: Intel Xeon @ 2.80GHz, 1 core, kernel 6.18.5, rustc 1.95.0,
smarm 0.3.0, no RUSTFLAGS. Single-CPU sandbox — N-thread rows identical to
1-thread; scaling sweep limited to 1 thread.

Notable findings:
- deep_recursion: tokio wins (22 vs 62 us); mmap stack alloc cost dominates
  for single-use actors at depth 500.
- yield_in_hot_loop: tokio wins (138 vs 182 ms); smarm mutex overhead on
  yield_now exceeds expected naked-switch advantage on 1 CPU.
- mpsc_contention/uncontended_channel/catch_unwind_panics: smarm wins as
  predicted.
- spawn_storm_busy: smarm 47x slower; global mutex saturated by bg yielders.
2026-05-25 13:04:54 +00:00

392 lines
14 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

//! Benchmarks where smarm's design has a structural advantage.
//!
//! These exist to show what the green-thread + stackful model buys you. The
//! single-thread numbers are the most interesting ones — they isolate the
//! per-switch / per-task cost from any contention story.
//!
//! Workloads:
//! 9. deep_recursion — actor recurses 500 deep (RECURSE_DEPTH) then returns. In
//! smarm this is plain stack recursion on the
//! growable mmap'd stack. In tokio, async fn
//! can't directly recurse — each level must
//! `Box::pin` its future. We measure both.
//! 10. yield_in_hot_loop — 2 actors ping yield_now back and forth 500k
//! times. Pure context-switch cost; no
//! channels, no allocation, no contention.
//! Smarm's switch is ~6 GPRs + xmm save and a
//! `ret`; tokio's is poll → state-machine →
//! schedule.
//! 11. uncontended_channel — single producer, single consumer, 1M msgs,
//! single-threaded runtime. With no
//! cross-thread contention, smarm's
//! Arc<Mutex<>> channel is essentially free,
//! and the green-thread switch should beat
//! tokio's future polling overhead.
//! 12. catch_unwind_panics — spawn 10k tasks; half panic, half succeed.
//! Supervisor handles each. Exploratory — if
//! there's no real gap, drop this one.
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;
// ---------------------------------------------------------------------------
// Shared harness
// ---------------------------------------------------------------------------
const ITERS: u32 = 15;
/// Returns the parallelism reported by the standard library, or 1 when the
/// query fails.
fn available_threads() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}
/// Prints the banner for one results table: a blank line, an 80-column `=`
/// rule, the title, another `=` rule, the column headings, and a `-` rule.
///
/// The column widths match the rows printed by `run_n`.
fn print_header(title: &str) {
    // Build each rule once instead of repeating the `repeat` call per line.
    let heavy_rule = "=".repeat(80);
    let light_rule = "-".repeat(80);

    println!("\n{heavy_rule}");
    println!(" {title}");
    println!("{heavy_rule}");
    println!(
        "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
        "runtime", "result", "median µs", "min µs", "max µs"
    );
    println!("{light_rule}");
}
/// Runs `f` once as a discarded warm-up, then `n` more times, and prints one
/// table row for `name`.
///
/// `f` returns `(result, elapsed)`. The row shows the result of the last timed
/// call followed by the median, minimum and maximum of the `n` elapsed values.
/// For an even `n` the median is the upper of the two middle samples.
///
/// When `n` is 0 there are no samples to summarise, so the three timing
/// columns are printed as `-` instead of indexing into an empty vector.
fn run_n<F: FnMut() -> (u64, u128)>(name: &str, n: u32, mut f: F) {
    // `n as usize` is only a capacity hint; u32 -> usize is lossless on
    // 32- and 64-bit targets.
    let mut times: Vec<u128> = Vec::with_capacity(n as usize);
    let mut last = 0u64;

    let _ = f(); // warmup
    for _ in 0..n {
        let (value, elapsed) = f();
        times.push(elapsed);
        last = value;
    }

    if times.is_empty() {
        println!(
            "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
            name, last, "-", "-", "-"
        );
        return;
    }

    // After sorting, min and max are simply the two ends of the vector.
    times.sort_unstable();
    let median = times[times.len() / 2];
    let min = times[0];
    let max = times[times.len() - 1];
    println!(
        "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
        name, last, median, min, max
    );
}
// ---------------------------------------------------------------------------
// 9. deep_recursion — RECURSE_DEPTH (500) levels deep
// ---------------------------------------------------------------------------
// Each recursive frame holds an `&AtomicU64`, a `u64`, plus prologue/spill —
// conservatively ~64 B/frame on release. Smarm actor stacks are a fixed 64 KiB,
// so 500 levels (~32 KiB) leaves comfortable headroom while still being deep
// enough to exercise the stack-growth advantage over Box::pin recursion.
const RECURSE_DEPTH: u64 = 500;
/// deep_recursion on smarm: one spawned actor recurses `RECURSE_DEPTH` levels
/// with ordinary function calls, bumping a shared counter at the bottom.
///
/// Returns `(counter value, elapsed µs)`. The timer covers runtime creation
/// with `threads` workers, the run itself, and everything up to the point the
/// `run` statement completes.
fn bench_recurse_smarm(threads: usize) -> (u64, u128) {
    // Ordinary (non-async) recursion executed on the actor's own stack.
    fn descend(hits: &AtomicU64, remaining: u64) -> u64 {
        if remaining > 0 {
            return 1 + descend(hits, remaining - 1);
        }
        hits.fetch_add(1, Ordering::Relaxed);
        0
    }

    let hits = Arc::new(AtomicU64::new(0));
    let actor_hits = Arc::clone(&hits);

    let timer = Instant::now();
    let config = smarm::runtime::Config::exact(threads);
    smarm::runtime::init(config).run(move || {
        let actor = smarm::spawn(move || {
            let _ = descend(&actor_hits, RECURSE_DEPTH);
        });
        actor.join().unwrap();
    });
    let elapsed = timer.elapsed().as_micros();

    (hits.load(Ordering::Relaxed), elapsed)
}
/// deep_recursion on tokio's current-thread runtime: one local task recurses
/// `RECURSE_DEPTH` levels through boxed futures, bumping a shared counter at
/// the bottom.
///
/// Returns `(counter value, elapsed µs)`.
///
/// The timed window covers building the runtime, running the workload, and
/// dropping the runtime. `bench_recurse_smarm` starts its timer before
/// `smarm::runtime::init` and its runtime temporary is dropped before the
/// timer is read, so timing only `block_on` here would exclude setup and
/// teardown on the tokio side only and skew the comparison.
fn bench_recurse_tokio_current() -> (u64, u128) {
    let counter = Arc::new(AtomicU64::new(0));
    let c2 = counter.clone();
    let start = Instant::now();
    let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
    let local = tokio::task::LocalSet::new();
    local.block_on(&rt, async move {
        // async fn can't self-recurse; each level returns a Box::pin'd future.
        // This is the canonical workaround a real user would write.
        fn recurse(
            c: Arc<AtomicU64>,
            n: u64,
        ) -> std::pin::Pin<Box<dyn std::future::Future<Output = u64>>> {
            Box::pin(async move {
                if n == 0 {
                    c.fetch_add(1, Ordering::Relaxed);
                    0
                } else {
                    1 + recurse(c, n - 1).await
                }
            })
        }
        let h = tokio::task::spawn_local(async move {
            let _ = recurse(c2, RECURSE_DEPTH).await;
        });
        let _ = h.await;
    });
    // Tear down inside the timed window (see the doc comment above).
    drop(local);
    drop(rt);
    (counter.load(Ordering::Relaxed), start.elapsed().as_micros())
}
/// deep_recursion on tokio's multi-thread runtime (one worker per available
/// thread): one spawned task recurses `RECURSE_DEPTH` levels through boxed
/// `Send` futures, bumping a shared counter at the bottom.
///
/// Returns `(counter value, elapsed µs)`.
///
/// The timed window covers building the runtime, running the workload, and
/// dropping the runtime. `bench_recurse_smarm` starts its timer before
/// `smarm::runtime::init` and its runtime temporary is dropped before the
/// timer is read, so timing only `block_on` here would exclude setup and
/// teardown on the tokio side only and skew the comparison.
fn bench_recurse_tokio_multi() -> (u64, u128) {
    let counter = Arc::new(AtomicU64::new(0));
    let c2 = counter.clone();
    let start = Instant::now();
    let rt = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(available_threads())
        .build()
        .unwrap();
    rt.block_on(async move {
        // Same boxed-future recursion as the current-thread variant, but the
        // future must be `Send` because `tokio::spawn` may move it between
        // worker threads.
        fn recurse(
            c: Arc<AtomicU64>,
            n: u64,
        ) -> std::pin::Pin<Box<dyn std::future::Future<Output = u64> + Send>> {
            Box::pin(async move {
                if n == 0 {
                    c.fetch_add(1, Ordering::Relaxed);
                    0
                } else {
                    1 + recurse(c, n - 1).await
                }
            })
        }
        let h = tokio::spawn(async move {
            let _ = recurse(c2, RECURSE_DEPTH).await;
        });
        let _ = h.await;
    });
    // Tear down inside the timed window (see the doc comment above).
    drop(rt);
    (counter.load(Ordering::Relaxed), start.elapsed().as_micros())
}
// ---------------------------------------------------------------------------
// 10. yield_in_hot_loop — 2 actors, 500k yields each, single thread
// ---------------------------------------------------------------------------
const HOT_YIELDS: u64 = 500_000;
/// yield_in_hot_loop on smarm: two actors on a single-thread runtime each call
/// `smarm::yield_now` `HOT_YIELDS` times.
///
/// Returns `(HOT_YIELDS * 2, elapsed µs)`; the first element is the nominal
/// number of yields, not a measured count.
fn bench_hot_smarm() -> (u64, u128) {
    // Body shared by both actors: yield in a tight loop and nothing else.
    fn yield_loop() {
        let mut remaining = HOT_YIELDS;
        while remaining > 0 {
            smarm::yield_now();
            remaining -= 1;
        }
    }

    let timer = Instant::now();
    let config = smarm::runtime::Config::exact(1);
    smarm::runtime::init(config).run(|| {
        let first = smarm::spawn(yield_loop);
        let second = smarm::spawn(yield_loop);
        first.join().unwrap();
        second.join().unwrap();
    });
    let elapsed = timer.elapsed().as_micros();

    (HOT_YIELDS * 2, elapsed)
}
/// yield_in_hot_loop on tokio's current-thread runtime: two local tasks each
/// await `tokio::task::yield_now` `HOT_YIELDS` times.
///
/// Returns `(HOT_YIELDS * 2, elapsed µs)`; the first element is the nominal
/// number of yields, not a measured count.
///
/// The timed window covers building the runtime, running the workload, and
/// dropping the runtime. `bench_hot_smarm` starts its timer before
/// `smarm::runtime::init` and its runtime temporary is dropped before the
/// timer is read, so timing only `block_on` here would exclude setup and
/// teardown on the tokio side only.
fn bench_hot_tokio_current() -> (u64, u128) {
    let start = Instant::now();
    let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
    let local = tokio::task::LocalSet::new();
    local.block_on(&rt, async move {
        let ha = tokio::task::spawn_local(async move {
            for _ in 0..HOT_YIELDS {
                tokio::task::yield_now().await;
            }
        });
        let hb = tokio::task::spawn_local(async move {
            for _ in 0..HOT_YIELDS {
                tokio::task::yield_now().await;
            }
        });
        let _ = ha.await;
        let _ = hb.await;
    });
    // Tear down inside the timed window (see the doc comment above).
    drop(local);
    drop(rt);
    (HOT_YIELDS * 2, start.elapsed().as_micros())
}
// ---------------------------------------------------------------------------
// 11. uncontended_channel — 1 producer, 1 consumer, 1M msgs, single-threaded
// ---------------------------------------------------------------------------
const UNCONT_MSGS: u64 = 1_000_000;
fn bench_unc_smarm() -> (u64, u128) {
let start = Instant::now();
smarm::runtime::init(smarm::runtime::Config::exact(1)).run(|| {
let (tx, rx) = smarm::channel::<u64>();
let consumer = smarm::spawn(move || {
let mut count = 0u64;
while let Ok(_) = rx.recv() {
count += 1;
}
let _ = count; // discard; run() closure must return ()
});
let producer = smarm::spawn(move || {
for i in 0..UNCONT_MSGS {
tx.send(i).unwrap();
}
// tx drops here, closing the channel.
});
producer.join().unwrap();
let _ = consumer.join().unwrap();
});
(UNCONT_MSGS, start.elapsed().as_micros())
}
/// uncontended_channel on tokio's current-thread runtime: one local producer
/// task sends `UNCONT_MSGS` integers to one local consumer task over an
/// unbounded mpsc channel.
///
/// Returns `(messages received, elapsed µs)`. The first element is the count
/// returned by the consumer task (0 if that task failed), so a lost or
/// truncated stream shows up in the "result" column instead of being masked by
/// the constant `UNCONT_MSGS`.
///
/// The timed window covers building the runtime, running the workload, and
/// dropping the runtime. `bench_unc_smarm` starts its timer before
/// `smarm::runtime::init` and its runtime temporary is dropped before the
/// timer is read, so timing only `block_on` here would exclude setup and
/// teardown on the tokio side only.
fn bench_unc_tokio_current() -> (u64, u128) {
    let start = Instant::now();
    let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
    let local = tokio::task::LocalSet::new();
    let received = local.block_on(&rt, async move {
        let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel::<u64>();
        let consumer = tokio::task::spawn_local(async move {
            let mut count = 0u64;
            // `recv` yields `None` once every sender has been dropped.
            while rx.recv().await.is_some() {
                count += 1;
            }
            count
        });
        let producer = tokio::task::spawn_local(async move {
            for i in 0..UNCONT_MSGS {
                tx.send(i).unwrap();
            }
            // tx drops here, closing the channel.
        });
        let _ = producer.await;
        // A failed consumer task is reported as 0 received messages rather
        // than aborting the whole benchmark run.
        consumer.await.unwrap_or(0)
    });
    // Tear down inside the timed window (see the doc comment above).
    drop(local);
    drop(rt);
    (received, start.elapsed().as_micros())
}
// ---------------------------------------------------------------------------
// 12. catch_unwind_panics — 10k tasks, half panic
// ---------------------------------------------------------------------------
const PANIC_TASKS: u64 = 10_000;
/// catch_unwind_panics on smarm: spawns `PANIC_TASKS` actors, every
/// even-numbered one panicking with "planned", then joins them all and tallies
/// how many joins returned `Ok` versus `Err`.
///
/// Returns `(ok + err, elapsed µs)` for a runtime configured with `threads`
/// workers.
fn bench_panic_smarm(threads: usize) -> (u64, u128) {
    let succeeded = Arc::new(AtomicU64::new(0));
    let failed = Arc::new(AtomicU64::new(0));
    let (succeeded_in, failed_in) = (Arc::clone(&succeeded), Arc::clone(&failed));

    let timer = Instant::now();
    let config = smarm::runtime::Config::exact(threads);
    smarm::runtime::init(config).run(move || {
        // Spawn every actor first; `collect` drives the iterator to the end
        // before any join happens.
        let actors: Vec<_> = (0..PANIC_TASKS)
            .map(|idx| {
                smarm::spawn(move || {
                    if idx % 2 == 0 {
                        panic!("planned");
                    }
                })
            })
            .collect();

        for actor in actors {
            let tally = if actor.join().is_ok() {
                &succeeded_in
            } else {
                &failed_in
            };
            tally.fetch_add(1, Ordering::Relaxed);
        }
    });

    let total = succeeded.load(Ordering::Relaxed) + failed.load(Ordering::Relaxed);
    (total, timer.elapsed().as_micros())
}
/// catch_unwind_panics on tokio's current-thread runtime: spawns
/// `PANIC_TASKS` local tasks, every even-numbered one panicking with
/// "planned", then awaits them all and tallies `Ok` versus `Err` join results.
///
/// Returns `(ok + err, elapsed µs)`.
///
/// The timed window covers building the runtime, running the workload, and
/// dropping the runtime. `bench_panic_smarm` starts its timer before
/// `smarm::runtime::init` and its runtime temporary is dropped before the
/// timer is read, so timing only `block_on` here would exclude setup and
/// teardown on the tokio side only.
fn bench_panic_tokio_current() -> (u64, u128) {
    let ok = Arc::new(AtomicU64::new(0));
    let err = Arc::new(AtomicU64::new(0));
    let ok2 = ok.clone();
    let err2 = err.clone();
    let start = Instant::now();
    let rt = tokio::runtime::Builder::new_current_thread().build().unwrap();
    let local = tokio::task::LocalSet::new();
    local.block_on(&rt, async move {
        let mut handles = Vec::new();
        for i in 0..PANIC_TASKS {
            handles.push(tokio::task::spawn_local(async move {
                if i % 2 == 0 {
                    panic!("planned");
                }
            }));
        }
        for h in handles {
            // A panicking task surfaces as `Err` from its join handle.
            match h.await {
                Ok(()) => {
                    ok2.fetch_add(1, Ordering::Relaxed);
                }
                Err(_) => {
                    err2.fetch_add(1, Ordering::Relaxed);
                }
            }
        }
    });
    // Tear down inside the timed window (see the doc comment above).
    drop(local);
    drop(rt);
    let total = ok.load(Ordering::Relaxed) + err.load(Ordering::Relaxed);
    (total, start.elapsed().as_micros())
}
/// catch_unwind_panics on tokio's multi-thread runtime (one worker per
/// available thread): spawns `PANIC_TASKS` tasks, every even-numbered one
/// panicking with "planned", then awaits them all and tallies `Ok` versus
/// `Err` join results.
///
/// Returns `(ok + err, elapsed µs)`.
///
/// The timed window covers building the runtime, running the workload, and
/// dropping the runtime. `bench_panic_smarm` starts its timer before
/// `smarm::runtime::init` and its runtime temporary is dropped before the
/// timer is read, so timing only `block_on` here would exclude setup and
/// teardown on the tokio side only.
fn bench_panic_tokio_multi() -> (u64, u128) {
    let ok = Arc::new(AtomicU64::new(0));
    let err = Arc::new(AtomicU64::new(0));
    let ok2 = ok.clone();
    let err2 = err.clone();
    let start = Instant::now();
    let rt = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(available_threads())
        .build()
        .unwrap();
    rt.block_on(async move {
        let mut handles = Vec::new();
        for i in 0..PANIC_TASKS {
            handles.push(tokio::spawn(async move {
                if i % 2 == 0 {
                    panic!("planned");
                }
            }));
        }
        for h in handles {
            // A panicking task surfaces as `Err` from its join handle.
            match h.await {
                Ok(()) => {
                    ok2.fetch_add(1, Ordering::Relaxed);
                }
                Err(_) => {
                    err2.fetch_add(1, Ordering::Relaxed);
                }
            }
        }
    });
    // Tear down inside the timed window (see the doc comment above).
    drop(rt);
    let total = ok.load(Ordering::Relaxed) + err.load(Ordering::Relaxed);
    (total, start.elapsed().as_micros())
}
// ---------------------------------------------------------------------------
// main
// ---------------------------------------------------------------------------
/// Prints the run configuration, then one results table per workload
/// (deep_recursion, yield_in_hot_loop, uncontended_channel,
/// catch_unwind_panics), each row produced by `run_n` with `ITERS` timed
/// iterations.
fn main() {
    let n = available_threads();
    println!("smarm smarm-favored benchmarks");
    println!("available parallelism: {n} threads");
    println!("ITERS={ITERS} (+1 warmup, discarded)");
    println!(
        "RECURSE_DEPTH={RECURSE_DEPTH}, HOT_YIELDS={HOT_YIELDS}×2, \
         UNCONT_MSGS={UNCONT_MSGS}, PANIC_TASKS={PANIC_TASKS}"
    );

    // ---- 9. deep_recursion ----
    print_header(&format!("deep_recursion: depth {RECURSE_DEPTH}"));
    run_n("smarm 1-thread", ITERS, || bench_recurse_smarm(1));
    run_n(&format!("smarm {n}-thread"), ITERS, || bench_recurse_smarm(n));
    run_n("tokio current_thread", ITERS, bench_recurse_tokio_current);
    run_n("tokio multi-thread", ITERS, bench_recurse_tokio_multi);

    // ---- 10. yield_in_hot_loop ----
    print_header(&format!("yield_in_hot_loop: 2 actors × {HOT_YIELDS} yields (single thread)"));
    run_n("smarm 1-thread", ITERS, bench_hot_smarm);
    run_n("tokio current_thread", ITERS, bench_hot_tokio_current);

    // ---- 11. uncontended_channel ----
    print_header(&format!("uncontended_channel: 1→1, {UNCONT_MSGS} msgs (single thread)"));
    run_n("smarm 1-thread", ITERS, bench_unc_smarm);
    run_n("tokio current_thread", ITERS, bench_unc_tokio_current);

    // ---- 12. catch_unwind_panics ----
    print_header(&format!("catch_unwind_panics: {PANIC_TASKS} tasks, 50% panic"));
    // The default panic hook writes a message to stderr for every panic, even
    // ones that are later caught. With thousands of planned panics per
    // iteration that stderr I/O would land inside the timed region and bury
    // the table in noise, so install a silent hook for this section only.
    // Trade-off: an unplanned panic in this section is also silent (the
    // process still exits with a failure status).
    let previous_hook = std::panic::take_hook();
    std::panic::set_hook(Box::new(|_| {}));
    run_n("smarm 1-thread", ITERS, || bench_panic_smarm(1));
    run_n(&format!("smarm {n}-thread"), ITERS, || bench_panic_smarm(n));
    run_n("tokio current_thread", ITERS, bench_panic_tokio_current);
    run_n("tokio multi-thread", ITERS, bench_panic_tokio_multi);
    // Restore normal panic reporting for anything that runs afterwards.
    std::panic::set_hook(previous_hook);
}