//! Benchmarks where smarm's design has a structural advantage.
//!
//! These exist to show what the green-thread + stackful model buys you. The
//! single-thread numbers are the most interesting ones — they isolate the
//! per-switch / per-task cost from any contention story.
//!
//! Workloads:
//!   9.  deep_recursion       — actor recurses 500 deep then returns. In
//!                              smarm this is plain stack recursion on the
//!                              growable mmap'd stack. In tokio, async fn
//!                              can't directly recurse — each level must
//!                              `Box::pin` its future. We measure both.
//!   10. yield_in_hot_loop    — 2 actors ping yield_now back and forth 500k
//!                              times. Pure context-switch cost; no
//!                              channels, no allocation, no contention.
//!                              Smarm's switch is ~6 GPRs + xmm save and a
//!                              `ret`; tokio's is poll → state-machine →
//!                              schedule.
//!   11. uncontended_channel  — single producer, single consumer, 1M msgs,
//!                              single-threaded runtime. With no
//!                              cross-thread contention, smarm's
//!                              Arc<Mutex<>> channel is essentially free,
//!                              and the green-thread switch should beat
//!                              tokio's future polling overhead.
//!   12. catch_unwind_panics  — spawn 10k tasks; half panic, half succeed.
//!                              Supervisor handles each. Exploratory — if
//!                              there's no real gap, drop this one.

use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use std::time::Instant;

// ---------------------------------------------------------------------------
// Shared harness
// ---------------------------------------------------------------------------

/// Number of measured iterations passed to `run_n` for every benchmark row;
/// `run_n` makes one additional warmup call whose timing is discarded.
const ITERS: u32 = 15;

/// Returns the value reported by `std::thread::available_parallelism`, or `1`
/// when that query returns an error.
fn available_threads() -> usize {
    match std::thread::available_parallelism() {
        Ok(count) => count.get(),
        Err(_) => 1,
    }
}

/// Prints the banner for one benchmark group to stdout: a blank line, the
/// `title` framed by two 80-column `=` rules, the column captions, and a
/// closing `-` rule.
///
/// The caption row uses the same column widths as the data rows printed by
/// `run_n`, so the table lines up.
fn print_header(title: &str) {
    let heavy_rule = "=".repeat(80);
    let light_rule = "-".repeat(80);

    println!("\n{}", heavy_rule);
    println!("  {title}");
    println!("{}", heavy_rule);
    println!(
        "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
        "runtime", "result", "median µs", "min µs", "max µs"
    );
    println!("{}", light_rule);
}

/// Runs one benchmark and prints its table row.
///
/// `f` is called once as a warmup (timing discarded) and then `n` more times.
/// Each call returns `(result, elapsed)`; this file's benchmarks report the
/// elapsed time in microseconds. The printed row contains `name`, the result
/// of the last call, and the median / min / max of the `n` measured timings.
/// For an even `n` the median is the upper of the two middle samples.
///
/// When `n == 0` only the warmup runs; the row then shows the warmup's result
/// and `-` in the timing columns instead of panicking on an empty sample set.
fn run_n<F: FnMut() -> (u64, u128)>(name: &str, n: u32, mut f: F) {
    // Warmup: its timing is thrown away, and its result is only shown when no
    // measured iteration follows.
    let (mut last, _) = f();

    let mut times = Vec::new();
    for _ in 0..n {
        let (value, elapsed) = f();
        times.push(elapsed);
        last = value;
    }

    if times.is_empty() {
        println!(
            "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
            name, last, "-", "-", "-"
        );
        return;
    }

    // After an ascending sort the extremes sit at the two ends, so no extra
    // min/max scans are needed.
    times.sort_unstable();
    let min = times[0];
    let max = times[times.len() - 1];
    let median = times[times.len() / 2];
    println!(
        "{:>26} | {:>12} | {:>10} | {:>10} | {:>10}",
        name, last, median, min, max
    );
}

// ---------------------------------------------------------------------------
// 9. deep_recursion — RECURSE_DEPTH (500) levels deep
// ---------------------------------------------------------------------------

// Recursion depth shared by the smarm and both tokio deep_recursion benchmarks.
//
// Each recursive frame holds an `&AtomicU64`, a `u64`, plus prologue/spill —
// conservatively ~64 B/frame on release. Smarm actor stacks are a fixed 64 KiB,
// so 500 levels (~32 KiB) leaves comfortable headroom while still being deep
// enough to exercise the stack-growth advantage over Box::pin recursion.
//
// NOTE(review): "fixed 64 KiB" above disagrees with the file header, which
// describes the actor stack as growable. The stack implementation is not
// visible from this file — confirm which statement is accurate before raising
// this depth.
const RECURSE_DEPTH: u64 = 500;

/// deep_recursion on smarm with `threads` worker threads.
///
/// Spawns a single actor that recurses `RECURSE_DEPTH` levels using ordinary
/// function calls on its own stack, then joins it.
///
/// Returns `(base-case hit count, elapsed µs)`. The clock is started before
/// `smarm::runtime::init`, so building the runtime (and `bench_cfg`) is part
/// of the measured time.
fn bench_recurse_smarm(threads: usize) -> (u64, u128) {
    // Plain Rust recursion: bumps the counter once at the bottom and returns
    // the depth on the way back up.
    fn descend(hits: &AtomicU64, remaining: u64) -> u64 {
        if remaining != 0 {
            return 1 + descend(hits, remaining - 1);
        }
        hits.fetch_add(1, Ordering::Relaxed);
        0
    }

    let hits = Arc::new(AtomicU64::new(0));
    let actor_hits = Arc::clone(&hits);
    let started = Instant::now();
    smarm::runtime::init(bench_cfg(threads)).run(move || {
        let actor = smarm::spawn(move || {
            let _ = descend(&actor_hits, RECURSE_DEPTH);
        });
        actor.join().unwrap();
    });
    (hits.load(Ordering::Relaxed), started.elapsed().as_micros())
}

/// deep_recursion on a tokio `current_thread` runtime, driven through a
/// `LocalSet` so the (non-`Send`) boxed futures can be spawned locally.
///
/// Returns `(base-case hit count, elapsed µs)`. The runtime is built before
/// the clock starts; the `LocalSet` is created after.
fn bench_recurse_tokio_current() -> (u64, u128) {
    type BoxedStep = std::pin::Pin<Box<dyn std::future::Future<Output = u64>>>;

    // An `async fn` cannot call itself directly, so every level hands back a
    // boxed, pinned future — the canonical workaround a real user would write.
    fn descend(hits: Arc<AtomicU64>, remaining: u64) -> BoxedStep {
        Box::pin(async move {
            match remaining {
                0 => {
                    hits.fetch_add(1, Ordering::Relaxed);
                    0
                }
                _ => 1 + descend(hits, remaining - 1).await,
            }
        })
    }

    let hits = Arc::new(AtomicU64::new(0));
    let task_hits = Arc::clone(&hits);
    let runtime = tokio::runtime::Builder::new_current_thread().build().unwrap();
    let started = Instant::now();
    let local_set = tokio::task::LocalSet::new();
    local_set.block_on(&runtime, async move {
        let task = tokio::task::spawn_local(async move {
            let _ = descend(task_hits, RECURSE_DEPTH).await;
        });
        let _ = task.await;
    });
    (hits.load(Ordering::Relaxed), started.elapsed().as_micros())
}

/// deep_recursion on a tokio multi-thread runtime with one worker per
/// `available_threads()`.
///
/// Same shape as the `current_thread` variant, except the boxed futures are
/// `Send` so the task can go through `tokio::spawn`.
///
/// Returns `(base-case hit count, elapsed µs)`. The runtime is built before
/// the clock starts.
fn bench_recurse_tokio_multi() -> (u64, u128) {
    type BoxedSendStep = std::pin::Pin<Box<dyn std::future::Future<Output = u64> + Send>>;

    // Each level returns a boxed, pinned future because an `async fn` cannot
    // recurse into itself directly.
    fn descend(hits: Arc<AtomicU64>, remaining: u64) -> BoxedSendStep {
        Box::pin(async move {
            match remaining {
                0 => {
                    hits.fetch_add(1, Ordering::Relaxed);
                    0
                }
                _ => 1 + descend(hits, remaining - 1).await,
            }
        })
    }

    let hits = Arc::new(AtomicU64::new(0));
    let task_hits = Arc::clone(&hits);
    let runtime = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(available_threads())
        .build()
        .unwrap();
    let started = Instant::now();
    runtime.block_on(async move {
        let task = tokio::spawn(async move {
            let _ = descend(task_hits, RECURSE_DEPTH).await;
        });
        let _ = task.await;
    });
    (hits.load(Ordering::Relaxed), started.elapsed().as_micros())
}

// ---------------------------------------------------------------------------
// 10. yield_in_hot_loop — 2 actors, 500k yields each, single thread
// ---------------------------------------------------------------------------

/// Number of `yield_now` calls made by EACH of the two tasks in the
/// yield_in_hot_loop benchmark (the reported result is twice this value).
const HOT_YIELDS: u64 = 500_000;

/// yield_in_hot_loop on a single-threaded smarm runtime.
///
/// Two actors each call `smarm::yield_now` `HOT_YIELDS` times and are then
/// joined in spawn order.
///
/// Returns `(2 * HOT_YIELDS, elapsed µs)`; the first element is the nominal
/// yield count, not a measured one. The clock starts before the runtime is
/// initialised.
fn bench_hot_smarm() -> (u64, u128) {
    // Body shared by both actors: nothing but back-to-back yields.
    fn spin() {
        for _ in 0..HOT_YIELDS {
            smarm::yield_now();
        }
    }

    let started = Instant::now();
    smarm::runtime::init(bench_cfg(1)).run(|| {
        let first = smarm::spawn(spin);
        let second = smarm::spawn(spin);
        first.join().unwrap();
        second.join().unwrap();
    });
    (2 * HOT_YIELDS, started.elapsed().as_micros())
}

/// yield_in_hot_loop on a tokio `current_thread` runtime.
///
/// Two `spawn_local` tasks each await `tokio::task::yield_now` `HOT_YIELDS`
/// times and are then awaited in spawn order.
///
/// Returns `(2 * HOT_YIELDS, elapsed µs)`; the first element is the nominal
/// yield count, not a measured one. The runtime is built before the clock
/// starts.
fn bench_hot_tokio_current() -> (u64, u128) {
    // Body shared by both tasks: nothing but back-to-back yields.
    async fn spin() {
        for _ in 0..HOT_YIELDS {
            tokio::task::yield_now().await;
        }
    }

    let runtime = tokio::runtime::Builder::new_current_thread().build().unwrap();
    let started = Instant::now();
    let local_set = tokio::task::LocalSet::new();
    local_set.block_on(&runtime, async move {
        let first = tokio::task::spawn_local(spin());
        let second = tokio::task::spawn_local(spin());
        let _ = first.await;
        let _ = second.await;
    });
    (2 * HOT_YIELDS, started.elapsed().as_micros())
}

// ---------------------------------------------------------------------------
// 11. uncontended_channel — 1 producer, 1 consumer, 1M msgs, single-threaded
// ---------------------------------------------------------------------------

/// Number of `u64` messages the single producer sends to the single consumer
/// in the uncontended_channel benchmark.
const UNCONT_MSGS: u64 = 1_000_000;

/// uncontended_channel on a single-threaded smarm runtime.
///
/// A producer actor sends `UNCONT_MSGS` integers over a `smarm::channel`; a
/// consumer actor receives until `recv` reports an error, counting messages.
///
/// Returns `(UNCONT_MSGS, elapsed µs)`; the first element is the nominal
/// message count — the consumer's own tally is not reported. The clock starts
/// before the runtime is initialised.
fn bench_unc_smarm() -> (u64, u128) {
    let started = Instant::now();
    smarm::runtime::init(bench_cfg(1)).run(|| {
        let (sender, receiver) = smarm::channel::<u64>();

        let consumer = smarm::spawn(move || {
            let mut received = 0u64;
            while receiver.recv().is_ok() {
                received += 1;
            }
            // The tally is deliberately dropped so this actor's closure
            // evaluates to `()`.
            let _ = received;
        });

        let producer = smarm::spawn(move || {
            for msg in 0..UNCONT_MSGS {
                sender.send(msg).unwrap();
            }
            // `sender` goes out of scope here, which is what lets the
            // consumer's receive loop terminate.
        });

        producer.join().unwrap();
        let _ = consumer.join().unwrap();
    });
    (UNCONT_MSGS, started.elapsed().as_micros())
}

/// uncontended_channel on a tokio `current_thread` runtime.
///
/// A producer task pushes `UNCONT_MSGS` integers into an unbounded mpsc
/// channel; a consumer task receives until `recv` yields `None`, counting
/// messages.
///
/// Returns `(UNCONT_MSGS, elapsed µs)`; the first element is the nominal
/// message count — the consumer's tally is computed but not reported. The
/// runtime is built before the clock starts.
fn bench_unc_tokio_current() -> (u64, u128) {
    let runtime = tokio::runtime::Builder::new_current_thread().build().unwrap();
    let started = Instant::now();
    let local_set = tokio::task::LocalSet::new();
    local_set.block_on(&runtime, async move {
        let (sender, mut receiver) = tokio::sync::mpsc::unbounded_channel::<u64>();

        let consumer = tokio::task::spawn_local(async move {
            let mut received = 0u64;
            while receiver.recv().await.is_some() {
                received += 1;
            }
            received
        });

        let producer = tokio::task::spawn_local(async move {
            for msg in 0..UNCONT_MSGS {
                sender.send(msg).unwrap();
            }
            // `sender` is dropped when this task finishes, which ends the
            // consumer's receive loop.
        });

        let _ = producer.await;
        let _ = consumer.await;
    });
    (UNCONT_MSGS, started.elapsed().as_micros())
}

// ---------------------------------------------------------------------------
// 12. catch_unwind_panics — 10k tasks, half panic
// ---------------------------------------------------------------------------

/// Number of tasks spawned per run of the catch_unwind_panics benchmark;
/// tasks with an even index panic, the rest return normally.
const PANIC_TASKS: u64 = 10_000;

/// catch_unwind_panics on smarm with `threads` worker threads.
///
/// Spawns `PANIC_TASKS` actors (even indices panic with `"planned"`), then
/// joins every handle and tallies `Ok` and `Err` outcomes separately.
///
/// Returns `(ok + err, elapsed µs)`, i.e. the number of handles joined. The
/// clock starts before the runtime is initialised.
fn bench_panic_smarm(threads: usize) -> (u64, u128) {
    let succeeded = Arc::new(AtomicU64::new(0));
    let panicked = Arc::new(AtomicU64::new(0));
    let (ok_tally, err_tally) = (Arc::clone(&succeeded), Arc::clone(&panicked));

    let started = Instant::now();
    smarm::runtime::init(bench_cfg(threads)).run(move || {
        let mut actors = Vec::new();
        for id in 0..PANIC_TASKS {
            let actor = smarm::spawn(move || {
                if id % 2 == 0 {
                    panic!("planned");
                }
            });
            actors.push(actor);
        }

        // Join in spawn order, bumping whichever counter matches the outcome.
        for actor in actors {
            let tally = if actor.join().is_ok() { &ok_tally } else { &err_tally };
            tally.fetch_add(1, Ordering::Relaxed);
        }
    });

    let joined = succeeded.load(Ordering::Relaxed) + panicked.load(Ordering::Relaxed);
    (joined, started.elapsed().as_micros())
}

/// catch_unwind_panics on a tokio `current_thread` runtime.
///
/// Spawns `PANIC_TASKS` local tasks (even indices panic with `"planned"`),
/// then awaits every join handle and tallies `Ok` and `Err` outcomes
/// separately.
///
/// Returns `(ok + err, elapsed µs)`, i.e. the number of handles awaited. The
/// runtime is built before the clock starts.
fn bench_panic_tokio_current() -> (u64, u128) {
    let succeeded = Arc::new(AtomicU64::new(0));
    let panicked = Arc::new(AtomicU64::new(0));
    let (ok_tally, err_tally) = (Arc::clone(&succeeded), Arc::clone(&panicked));

    let runtime = tokio::runtime::Builder::new_current_thread().build().unwrap();
    let started = Instant::now();
    let local_set = tokio::task::LocalSet::new();
    local_set.block_on(&runtime, async move {
        let mut tasks = Vec::new();
        for id in 0..PANIC_TASKS {
            let task = tokio::task::spawn_local(async move {
                if id % 2 == 0 {
                    panic!("planned");
                }
            });
            tasks.push(task);
        }

        // Await in spawn order, bumping whichever counter matches the outcome.
        for task in tasks {
            let tally = if task.await.is_ok() { &ok_tally } else { &err_tally };
            tally.fetch_add(1, Ordering::Relaxed);
        }
    });

    let joined = succeeded.load(Ordering::Relaxed) + panicked.load(Ordering::Relaxed);
    (joined, started.elapsed().as_micros())
}

/// catch_unwind_panics on a tokio multi-thread runtime with one worker per
/// `available_threads()`.
///
/// Spawns `PANIC_TASKS` tasks (even indices panic with `"planned"`), then
/// awaits every join handle and tallies `Ok` and `Err` outcomes separately.
///
/// Returns `(ok + err, elapsed µs)`, i.e. the number of handles awaited. The
/// runtime is built before the clock starts.
fn bench_panic_tokio_multi() -> (u64, u128) {
    let succeeded = Arc::new(AtomicU64::new(0));
    let panicked = Arc::new(AtomicU64::new(0));
    let (ok_tally, err_tally) = (Arc::clone(&succeeded), Arc::clone(&panicked));

    let runtime = tokio::runtime::Builder::new_multi_thread()
        .worker_threads(available_threads())
        .build()
        .unwrap();
    let started = Instant::now();
    runtime.block_on(async move {
        let mut tasks = Vec::new();
        for id in 0..PANIC_TASKS {
            let task = tokio::spawn(async move {
                if id % 2 == 0 {
                    panic!("planned");
                }
            });
            tasks.push(task);
        }

        // Await in spawn order, bumping whichever counter matches the outcome.
        for task in tasks {
            let tally = if task.await.is_ok() { &ok_tally } else { &err_tally };
            tally.fetch_add(1, Ordering::Relaxed);
        }
    });

    let joined = succeeded.load(Ordering::Relaxed) + panicked.load(Ordering::Relaxed);
    (joined, started.elapsed().as_micros())
}

// ---------------------------------------------------------------------------
// main
// ---------------------------------------------------------------------------


// ---------------------------------------------------------------------------
// Knob helper — reads SMARM_ALLOC_INTERVAL / SMARM_TIMESLICE_CYCLES env vars
// so the sweep script can override the preemption knobs without recompiling.
// ---------------------------------------------------------------------------

/// Builds the smarm runtime configuration used by every smarm benchmark.
///
/// Starts from `Config::exact(threads)` and applies two optional overrides
/// read from the environment on every call:
///
/// * `SMARM_ALLOC_INTERVAL`   — parsed as `u32`, passed to `alloc_interval`
/// * `SMARM_TIMESLICE_CYCLES` — parsed as `u64`, passed to `timeslice_cycles`
///
/// An unset variable leaves the corresponding knob at its default. A variable
/// that is set but cannot be parsed is also ignored, but a warning is written
/// to stderr (once per knob per process) so a sweep with a mistyped value does
/// not silently report default-knob numbers.
fn bench_cfg(threads: usize) -> smarm::runtime::Config {
    // One latch per knob: this function runs once per benchmark iteration, so
    // an unlatched warning would repeat many times and land inside the timed
    // region each time.
    static ALLOC_INTERVAL_WARNED: std::sync::Once = std::sync::Once::new();
    static TIMESLICE_WARNED: std::sync::Once = std::sync::Once::new();

    let mut cfg = smarm::runtime::Config::exact(threads);
    if let Some(n) = env_knob::<u32>("SMARM_ALLOC_INTERVAL", &ALLOC_INTERVAL_WARNED) {
        cfg = cfg.alloc_interval(n);
    }
    if let Some(n) = env_knob::<u64>("SMARM_TIMESLICE_CYCLES", &TIMESLICE_WARNED) {
        cfg = cfg.timeslice_cycles(n);
    }
    cfg
}

/// Reads environment variable `name` and parses it (after trimming surrounding
/// whitespace) as `T`.
///
/// Returns `None` when the variable is unset. When it is set but not valid
/// Unicode or not parseable as `T`, returns `None` and prints a warning to
/// stderr the first time `warned` is triggered.
fn env_knob<T: std::str::FromStr>(name: &str, warned: &std::sync::Once) -> Option<T> {
    let raw = match std::env::var(name) {
        Ok(raw) => raw,
        Err(std::env::VarError::NotPresent) => return None,
        Err(std::env::VarError::NotUnicode(_)) => {
            warned.call_once(|| {
                eprintln!("warning: ignoring {name}: value is not valid Unicode");
            });
            return None;
        }
    };
    match raw.trim().parse::<T>() {
        Ok(value) => Some(value),
        Err(_) => {
            warned.call_once(|| {
                eprintln!("warning: ignoring {name}={raw:?}: not a valid number");
            });
            None
        }
    }
}

/// Entry point: prints the run parameters, then executes benchmark groups
/// 9–12 in order, printing one header per group and one row per runtime
/// configuration via `run_n`.
///
/// Before the panic benchmark a panic hook is installed that silences only
/// the benchmark's own `"planned"` panics; any other panic is still reported
/// through the previously installed hook.
fn main() {
    let n = available_threads();
    println!("smarm smarm-favored benchmarks");
    println!("available parallelism: {n} threads");
    println!("ITERS={ITERS} (+1 warmup, discarded)");
    println!(
        "RECURSE_DEPTH={RECURSE_DEPTH}, HOT_YIELDS={HOT_YIELDS}×2, \
         UNCONT_MSGS={UNCONT_MSGS}, PANIC_TASKS={PANIC_TASKS}"
    );

    // ---- 9. deep_recursion ----
    print_header(&format!("deep_recursion: depth {RECURSE_DEPTH}"));
    run_n("smarm 1-thread", ITERS, || bench_recurse_smarm(1));
    run_n(&format!("smarm {n}-thread"), ITERS, || bench_recurse_smarm(n));
    run_n("tokio current_thread", ITERS, bench_recurse_tokio_current);
    run_n("tokio multi-thread", ITERS, bench_recurse_tokio_multi);

    // ---- 10. yield_in_hot_loop ----
    print_header(&format!("yield_in_hot_loop: 2 actors × {HOT_YIELDS} yields (single thread)"));
    run_n("smarm 1-thread", ITERS, bench_hot_smarm);
    run_n("tokio current_thread", ITERS, bench_hot_tokio_current);

    // ---- 11. uncontended_channel ----
    print_header(&format!("uncontended_channel: 1→1, {UNCONT_MSGS} msgs (single thread)"));
    run_n("smarm 1-thread", ITERS, bench_unc_smarm);
    run_n("tokio current_thread", ITERS, bench_unc_tokio_current);

    // ---- 12. catch_unwind_panics ----
    // Each planned panic would otherwise run the default panic hook, which
    // writes a message (and possibly a backtrace) to stderr. Across all rows
    // that is tens of thousands of lines, and the cost of producing them would
    // be timed together with the unwind/join path this benchmark is about.
    // Only the "planned" payload is silenced; anything else goes to the
    // previous hook. This is the last group, so the hook is left installed.
    let previous_hook = std::panic::take_hook();
    std::panic::set_hook(Box::new(move |info| {
        let payload = info.payload();
        let planned = payload
            .downcast_ref::<&str>()
            .map_or(false, |msg| *msg == "planned")
            || payload
                .downcast_ref::<String>()
                .map_or(false, |msg| msg.as_str() == "planned");
        if !planned {
            previous_hook(info);
        }
    }));

    print_header(&format!("catch_unwind_panics: {PANIC_TASKS} tasks, 50% panic"));
    run_n("smarm 1-thread", ITERS, || bench_panic_smarm(1));
    run_n(&format!("smarm {n}-thread"), ITERS, || bench_panic_smarm(n));
    run_n("tokio current_thread", ITERS, bench_panic_tokio_current);
    run_n("tokio multi-thread", ITERS, bench_panic_tokio_multi);
}