//! General benchmarks — workloads where neither runtime has a structural
//! advantage. Both should be competitive; large gaps here indicate a real
//! difference in per-task or per-yield overhead.
//!
//! Workloads:
//!   1. chained_spawn     — task N spawns N+1, depth 1000. Spawn+exit overhead
//!                          in a serial chain. Adapted from tokio's bench of
//!                          the same name.
//!   2. yield_many        — 200 actors × 1000 yields. Pure scheduling
//!                          throughput with no allocation, no IO. Adapted from
//!                          tokio.
//!   3. fan_out_compute   — count primes in [2, 400_000) across 64 workers.
//!                          Same shape as multi_scheduler::primes but lives
//!                          here for completeness.
//!   4. ping_pong_oneshot — N rounds of (spawn pair, send oneshot, await).
//!                          Closer to a request/response workload than channel
//!                          ping-pong.

use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;

// ---------------------------------------------------------------------------
// Shared harness
// ---------------------------------------------------------------------------

/// Number of timed iterations per table row (one extra warmup run is
/// executed first and discarded, see [`run_n`]).
const ITERS: u32 = 15;

/// Returns the parallelism reported by the standard library, or `1` when it
/// cannot be determined.
fn available_threads() -> usize {
    std::thread::available_parallelism()
        .map(|n| n.get())
        .unwrap_or(1)
}

/// Prints the banner and column headings for one benchmark table.
///
/// The rule width (80) equals the width of a row produced by [`run_n`]:
/// 26 + 12 + 3 × 10 columns plus four 3-character separators.
fn print_header(title: &str) {
    println!("\n{}", "=".repeat(80));
    println!("  {title}");
    println!("{}", "=".repeat(80));
    println!(
        "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
        "runtime", "result", "median µs", "min µs", "max µs"
    );
    println!("{}", "-".repeat(80));
}

/// Runs `f` once as a discarded warmup, then `n` timed iterations, and prints
/// one table row with the median / min / max of the reported times.
///
/// * `name` — label printed in the first column.
/// * `n`    — number of timed iterations. With `n == 0` only the warmup runs
///            and the timing columns are printed as `-` instead of panicking
///            on an empty sample set.
/// * `f`    — benchmark body returning `(result, elapsed_µs)`. The result of
///            the final timed iteration is printed so the work is observable.
///
/// For an even number of samples the upper of the two middle values is
/// reported as the median.
fn run_n<F: FnMut() -> (u64, u128)>(name: &str, n: u32, mut f: F) {
    // One warmup iteration, discarded.
    let _ = f();

    // u32 -> usize is a widening conversion on every supported target.
    let mut times: Vec<u128> = Vec::with_capacity(n as usize);
    let mut last = 0u64;
    for _ in 0..n {
        let (value, elapsed) = f();
        times.push(elapsed);
        last = value;
    }
    times.sort_unstable();

    // After sorting, the extremes are simply the first and last elements.
    match (times.first(), times.last()) {
        (Some(&min), Some(&max)) => {
            let median = times[times.len() / 2];
            println!(
                "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
                name, last, median, min, max
            );
        }
        // No timed iterations were requested: there is nothing to summarise.
        _ => println!(
            "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
            name, last, "-", "-", "-"
        ),
    }
}

// ---------------------------------------------------------------------------
// 1.
chained_spawn — depth 1000 // --------------------------------------------------------------------------- const CHAIN_DEPTH: u64 = 1_000; fn bench_chained_smarm(threads: usize) -> (u64, u128) { let counter = Arc::new(AtomicU64::new(0)); let c2 = counter.clone(); let start = Instant::now(); smarm::runtime::init(bench_cfg(threads)).run(move || { // Fire-and-forget chain, matching tokio's bench shape: each link // spawns the next link and exits immediately; depth 0 signals done // via a channel. Crucially this does *not* nest joins on the // spawner's stack — important because smarm actor stacks are a // fixed 64 KiB. let (tx, rx) = smarm::channel::<()>(); fn iter(c: Arc, tx: smarm::Sender<()>, n: u64) { if n == 0 { tx.send(()).unwrap(); } else { let cc = c.clone(); smarm::spawn(move || { cc.fetch_add(1, Ordering::Relaxed); iter(cc.clone(), tx, n - 1); }); // Caller exits; JoinHandle dropped, no parking. } } iter(c2, tx, CHAIN_DEPTH); rx.recv().unwrap(); }); (counter.load(Ordering::Relaxed), start.elapsed().as_micros()) } fn bench_chained_tokio_current() -> (u64, u128) { let counter = Arc::new(AtomicU64::new(0)); let c2 = counter.clone(); let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); let start = Instant::now(); let local = tokio::task::LocalSet::new(); local.block_on(&rt, async move { // Use a oneshot done channel like tokio's own chained_spawn bench. 
let (done_tx, done_rx) = tokio::sync::oneshot::channel(); fn iter( c: Arc, done: tokio::sync::oneshot::Sender<()>, n: u64, ) { if n == 0 { let _ = done.send(()); } else { tokio::task::spawn_local(async move { c.fetch_add(1, Ordering::Relaxed); iter(c, done, n - 1); }); } } iter(c2, done_tx, CHAIN_DEPTH); let _ = done_rx.await; }); (counter.load(Ordering::Relaxed), start.elapsed().as_micros()) } fn bench_chained_tokio_multi() -> (u64, u128) { let counter = Arc::new(AtomicU64::new(0)); let c2 = counter.clone(); let rt = tokio::runtime::Builder::new_multi_thread() .worker_threads(available_threads()) .build() .unwrap(); let start = Instant::now(); rt.block_on(async move { let (done_tx, done_rx) = tokio::sync::oneshot::channel(); fn iter(c: Arc, done: tokio::sync::oneshot::Sender<()>, n: u64) { if n == 0 { let _ = done.send(()); } else { tokio::spawn(async move { c.fetch_add(1, Ordering::Relaxed); iter(c, done, n - 1); }); } } iter(c2, done_tx, CHAIN_DEPTH); let _ = done_rx.await; }); (counter.load(Ordering::Relaxed), start.elapsed().as_micros()) } // --------------------------------------------------------------------------- // 2. 
yield_many — 200 actors × 1000 yields // --------------------------------------------------------------------------- const YIELD_TASKS: u64 = 200; const YIELD_ROUNDS: u64 = 1_000; fn bench_yield_smarm(threads: usize) -> (u64, u128) { let start = Instant::now(); smarm::runtime::init(bench_cfg(threads)).run(|| { let mut handles = Vec::new(); for _ in 0..YIELD_TASKS { handles.push(smarm::spawn(|| { for _ in 0..YIELD_ROUNDS { smarm::yield_now(); } })); } for h in handles { h.join().unwrap(); } }); (YIELD_TASKS * YIELD_ROUNDS, start.elapsed().as_micros()) } fn bench_yield_tokio_current() -> (u64, u128) { let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); let start = Instant::now(); let local = tokio::task::LocalSet::new(); local.block_on(&rt, async move { let mut handles = Vec::new(); for _ in 0..YIELD_TASKS { handles.push(tokio::task::spawn_local(async move { for _ in 0..YIELD_ROUNDS { tokio::task::yield_now().await; } })); } for h in handles { let _ = h.await; } }); (YIELD_TASKS * YIELD_ROUNDS, start.elapsed().as_micros()) } fn bench_yield_tokio_multi() -> (u64, u128) { let rt = tokio::runtime::Builder::new_multi_thread() .worker_threads(available_threads()) .build() .unwrap(); let start = Instant::now(); rt.block_on(async move { let mut handles = Vec::new(); for _ in 0..YIELD_TASKS { handles.push(tokio::spawn(async move { for _ in 0..YIELD_ROUNDS { tokio::task::yield_now().await; } })); } for h in handles { let _ = h.await; } }); (YIELD_TASKS * YIELD_ROUNDS, start.elapsed().as_micros()) } // --------------------------------------------------------------------------- // 3. 
fan_out_compute — primes, same shape as multi_scheduler::primes // --------------------------------------------------------------------------- const PRIME_N: u64 = 400_000; const PRIME_WORKERS: u64 = 64; fn is_prime(n: u64) -> bool { if n < 2 { return false; } if n < 4 { return true; } if n % 2 == 0 { return false; } let mut i = 3u64; while i * i <= n { if n % i == 0 { return false; } i += 2; } true } fn count_primes(lo: u64, hi: u64) -> u64 { (lo..hi).filter(|&n| is_prime(n)).count() as u64 } fn primes_slice(w: u64) -> (u64, u64) { let per = PRIME_N / PRIME_WORKERS; let lo = w * per; let hi = if w + 1 == PRIME_WORKERS { PRIME_N } else { lo + per }; (lo, hi) } fn bench_primes_smarm(threads: usize) -> (u64, u128) { let total = Arc::new(AtomicU64::new(0)); let t2 = total.clone(); let start = Instant::now(); smarm::runtime::init(bench_cfg(threads)).run(move || { let mut handles = Vec::new(); for w in 0..PRIME_WORKERS { let (lo, hi) = primes_slice(w); let tc = t2.clone(); handles.push(smarm::spawn(move || { tc.fetch_add(count_primes(lo, hi), Ordering::Relaxed); })); } for h in handles { h.join().unwrap(); } }); (total.load(Ordering::Relaxed), start.elapsed().as_micros()) } fn bench_primes_tokio_current() -> (u64, u128) { let total = Arc::new(AtomicU64::new(0)); let t2 = total.clone(); let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); let start = Instant::now(); let local = tokio::task::LocalSet::new(); local.block_on(&rt, async move { let mut handles = Vec::new(); for w in 0..PRIME_WORKERS { let (lo, hi) = primes_slice(w); let tc = t2.clone(); handles.push(tokio::task::spawn_local(async move { tc.fetch_add(count_primes(lo, hi), Ordering::Relaxed); })); } for h in handles { let _ = h.await; } }); (total.load(Ordering::Relaxed), start.elapsed().as_micros()) } fn bench_primes_tokio_multi() -> (u64, u128) { let total = Arc::new(AtomicU64::new(0)); let t2 = total.clone(); let rt = tokio::runtime::Builder::new_multi_thread() 
.worker_threads(available_threads()) .build() .unwrap(); let start = Instant::now(); rt.block_on(async move { let mut handles = Vec::new(); for w in 0..PRIME_WORKERS { let (lo, hi) = primes_slice(w); let tc = t2.clone(); handles.push(tokio::spawn(async move { tc.fetch_add(count_primes(lo, hi), Ordering::Relaxed); })); } for h in handles { let _ = h.await; } }); (total.load(Ordering::Relaxed), start.elapsed().as_micros()) } // --------------------------------------------------------------------------- // 4. ping_pong_oneshot — 1000 rounds of spawn-pair-await // --------------------------------------------------------------------------- const PP_ROUNDS: u64 = 1_000; fn bench_pp_smarm(threads: usize) -> (u64, u128) { let start = Instant::now(); smarm::runtime::init(bench_cfg(threads)).run(|| { for _ in 0..PP_ROUNDS { // smarm has no oneshot, so use a channel<()> per round — both // sides spawn, A sends ping, B replies pong, A joins B. let (tx_ping, rx_ping) = smarm::channel::<()>(); let (tx_pong, rx_pong) = smarm::channel::<()>(); let hb = smarm::spawn(move || { rx_ping.recv().unwrap(); tx_pong.send(()).unwrap(); }); let ha = smarm::spawn(move || { tx_ping.send(()).unwrap(); rx_pong.recv().unwrap(); }); ha.join().unwrap(); hb.join().unwrap(); } }); (PP_ROUNDS, start.elapsed().as_micros()) } fn bench_pp_tokio_current() -> (u64, u128) { let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); let start = Instant::now(); let local = tokio::task::LocalSet::new(); local.block_on(&rt, async move { for _ in 0..PP_ROUNDS { let (tx1, rx1) = tokio::sync::oneshot::channel::<()>(); let (tx2, rx2) = tokio::sync::oneshot::channel::<()>(); let hb = tokio::task::spawn_local(async move { rx1.await.unwrap(); tx2.send(()).unwrap(); }); let ha = tokio::task::spawn_local(async move { tx1.send(()).unwrap(); rx2.await.unwrap(); }); let _ = ha.await; let _ = hb.await; } }); (PP_ROUNDS, start.elapsed().as_micros()) } fn bench_pp_tokio_multi() -> (u64, u128) { let rt = 
tokio::runtime::Builder::new_multi_thread() .worker_threads(available_threads()) .build() .unwrap(); let start = Instant::now(); rt.block_on(async move { for _ in 0..PP_ROUNDS { let (tx1, rx1) = tokio::sync::oneshot::channel::<()>(); let (tx2, rx2) = tokio::sync::oneshot::channel::<()>(); let hb = tokio::spawn(async move { rx1.await.unwrap(); tx2.send(()).unwrap(); }); let ha = tokio::spawn(async move { tx1.send(()).unwrap(); rx2.await.unwrap(); }); let _ = ha.await; let _ = hb.await; } }); (PP_ROUNDS, start.elapsed().as_micros()) } // --------------------------------------------------------------------------- // main // --------------------------------------------------------------------------- // --------------------------------------------------------------------------- // Knob helper — reads SMARM_ALLOC_INTERVAL / SMARM_TIMESLICE_CYCLES env vars // so the sweep script can override the preemption knobs without recompiling. // --------------------------------------------------------------------------- fn bench_cfg(threads: usize) -> smarm::runtime::Config { let mut cfg = smarm::runtime::Config::exact(threads); if let Ok(v) = std::env::var("SMARM_ALLOC_INTERVAL") { if let Ok(n) = v.parse::() { cfg = cfg.alloc_interval(n); } } if let Ok(v) = std::env::var("SMARM_TIMESLICE_CYCLES") { if let Ok(n) = v.parse::() { cfg = cfg.timeslice_cycles(n); } } cfg } fn main() { let n = available_threads(); println!("smarm general benchmarks"); println!("available parallelism: {n} threads"); println!("ITERS={ITERS} (+1 warmup, discarded)"); println!( "CHAIN_DEPTH={CHAIN_DEPTH}, YIELD_TASKS={YIELD_TASKS}×{YIELD_ROUNDS}, \ PRIME_N={PRIME_N}/{PRIME_WORKERS} workers, PP_ROUNDS={PP_ROUNDS}" ); // ---- 1. 
chained_spawn ---- print_header(&format!("chained_spawn: depth {CHAIN_DEPTH}")); run_n("smarm 1-thread", ITERS, || bench_chained_smarm(1)); run_n(&format!("smarm {n}-thread"), ITERS, || bench_chained_smarm(n)); run_n("tokio current_thread", ITERS, bench_chained_tokio_current); run_n("tokio multi-thread", ITERS, bench_chained_tokio_multi); // ---- 2. yield_many ---- print_header(&format!("yield_many: {YIELD_TASKS} tasks × {YIELD_ROUNDS} yields")); run_n("smarm 1-thread", ITERS, || bench_yield_smarm(1)); run_n(&format!("smarm {n}-thread"), ITERS, || bench_yield_smarm(n)); run_n("tokio current_thread", ITERS, bench_yield_tokio_current); run_n("tokio multi-thread", ITERS, bench_yield_tokio_multi); // ---- 3. fan_out_compute ---- print_header(&format!("fan_out_compute: primes in [2, {PRIME_N}) across {PRIME_WORKERS}")); run_n("smarm 1-thread", ITERS, || bench_primes_smarm(1)); run_n(&format!("smarm {n}-thread"), ITERS, || bench_primes_smarm(n)); run_n("tokio current_thread", ITERS, bench_primes_tokio_current); run_n("tokio multi-thread", ITERS, bench_primes_tokio_multi); // ---- 4. ping_pong_oneshot ---- print_header(&format!("ping_pong_oneshot: {PP_ROUNDS} rounds")); run_n("smarm 1-thread", ITERS, || bench_pp_smarm(1)); run_n(&format!("smarm {n}-thread"), ITERS, || bench_pp_smarm(n)); run_n("tokio current_thread", ITERS, bench_pp_tokio_current); run_n("tokio multi-thread", ITERS, bench_pp_tokio_multi); }