From 978678a46e6b1c450dba5667e1a82eb0d8c331a4 Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 23 May 2026 16:09:35 +0000 Subject: [PATCH] feat: full runtime redesign (v0.6) Complete rewrite with improved architecture & correctness: - src/runtime.rs: Simplified task scheduling with proper state transitions - src/scheduler.rs: Decoupled from runtime, pure task queue logic - src/io.rs, src/mutex.rs: Refactored for clarity & performance - New actor model framework (src/actor.rs, src/context.rs) - Channel primitives (src/channel.rs) & process IDs (src/pid.rs) - Preemption framework (src/preempt.rs) for fair timeslicing - Expanded benchmarks & tests (multi_scheduler, primes, runtime) --- .gitignore | 2 + Cargo.toml | 31 ++ LOOM.md | 210 +++++++++++ README.md | 82 +++++ benches/multi_scheduler.rs | 343 ++++++++++++++++++ benches/primes.rs | 134 +++++++ src/actor.rs | 110 ++++++ src/channel.rs | 153 ++++++++ src/context.rs | 106 ++++++ src/io.rs | 520 +++++++++++++++++++++++++++ src/lib.rs | 60 ++++ src/mutex.rs | 248 +++++++++++++ src/pid.rs | 38 ++ src/preempt.rs | 129 +++++++ src/runtime.rs | 718 +++++++++++++++++++++++++++++++++++++ src/scheduler.rs | 349 ++++++++++++++++++ src/stack.rs | 89 +++++ src/supervisor.rs | 37 ++ src/timer.rs | 147 ++++++++ src/trace.rs | 246 +++++++++++++ tests/channel.rs | 110 ++++++ tests/context.rs | 137 +++++++ tests/io.rs | 99 +++++ tests/io_epoll.rs | 324 +++++++++++++++++ tests/mutex.rs | 314 ++++++++++++++++ tests/pid.rs | 22 ++ tests/preempt.rs | 66 ++++ tests/runtime.rs | 426 ++++++++++++++++++++++ tests/scheduler.rs | 171 +++++++++ tests/stack.rs | 123 +++++++ tests/timer.rs | 207 +++++++++++ 31 files changed, 5751 insertions(+) create mode 100644 .gitignore create mode 100644 Cargo.toml create mode 100644 LOOM.md create mode 100644 README.md create mode 100644 benches/multi_scheduler.rs create mode 100644 benches/primes.rs create mode 100644 src/actor.rs create mode 100644 src/channel.rs create mode 100644 src/context.rs create mode 100644 src/io.rs create mode 100644 src/lib.rs create mode 100644 src/mutex.rs create mode 100644 src/pid.rs create mode 100644 src/preempt.rs create mode 100644 src/runtime.rs create mode 100644 src/scheduler.rs create mode 100644 src/stack.rs create mode 100644 src/supervisor.rs create mode 100644 src/timer.rs create mode 100644 src/trace.rs create mode 100644 tests/channel.rs create mode 100644 tests/context.rs create mode 100644 tests/io.rs create mode 100644 tests/io_epoll.rs create mode 100644 tests/mutex.rs create mode 100644 tests/pid.rs create mode 100644 tests/preempt.rs create mode 100644 tests/runtime.rs create mode 100644 tests/scheduler.rs create mode 100644 tests/stack.rs create mode 100644 tests/timer.rs diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..96ef6c0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +/target +Cargo.lock diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..0df0293 --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,31 @@ +[package] +name = "smarm" +version = "0.3.0" +edition = "2021" +rust-version = "1.95" + +[features] +smarm-trace = [] + +[dependencies] +libc = "0.2" + +[dev-dependencies] +libc = "0.2" +tokio = { version = "1", features = ["rt", "rt-multi-thread", "macros", "sync"] } + +[profile.dev] +panic = "unwind" + +[profile.release] +panic = "unwind" +lto = "thin" +codegen-units = 1 + +[[bench]] +name = "primes" +harness = false + +[[bench]] +name = "multi_scheduler" +harness = false diff --git a/LOOM.md b/LOOM.md new file mode 100644 index 0000000..179143c --- /dev/null +++ b/LOOM.md @@ -0,0 +1,210 @@ +# Loom + +> Erlang-style actor concurrency for Rust, without the copies, the colors, or the GC pauses. + +--- + +## Vision + +Rust gives you the right ownership discipline for safe actor concurrency almost for free — `Send` already +draws the boundary, the borrow checker already enforces it. What it lacks is an execution model to match: +async/await is IO-centric, colors your functions, and trades stack simplicity for state-machine complexity; +OS threads are too heavy to spawn per actor. + +Loom adds a third option: **green-thread actors on a shared heap**, scheduled cooperatively, with +message-passing as the only cross-actor communication primitive. You get Erlang's isolation model without +Erlang's copying GC, and you get Rust's zero-copy ownership transfers without async's cognitive overhead. +No function coloring. No `Box`. Just actors, messages, and the borrow checker doing what it +already does. + +--- + +## Do: Core Runtime + +### Actors and scheduling + +Each actor is a lightweight green thread with its own heap-allocated, growable stack. Stacks are +allocated via `mmap` with a guard page below the region; overflow is detected by the OS without Loom +polling for it. Initial stacks are small and grow by remapping on demand. + +The scheduler runs one OS thread per CPU. Each scheduler thread loops against a single global +`Mutex` queue shared across all schedulers. If queue contention becomes a measured bottleneck +this can be revisited; the interface will not change. + +Loom requires `panic = unwind`. Users who set `panic = abort` accept that supervision and actor +isolation are silently degraded to process death. + +### Process descriptor + +Each actor has a descriptor that is hot while the actor runs and will typically live in L1 cache. +It holds: + +- `stack_base: *mut u8` — bottom of the allocated stack region +- `stack_cap: usize` — total allocated size +- `stack_ptr: *mut u8` — current stack pointer (`rsp`), saved on yield +- `pid: (u32, u32)` — index and generation counter (see PIDs below) +- `alloc_count: u32` — countdown for preemption sampling +- `timeslice_start: u64` — `RDTSC` value written on every resume +- `resize_count: u16` — diagnostic counter for stack growth events +- `context: *mut ContextSaveArea` — pointer to the register save area (cold, touched only on switch) + +### Context switching + +Context switching is implemented in a `#[naked]` assembly shim, one per supported architecture. +The compiler cannot be asked to switch stacks. + +**Suspend** (yield, preemption, or blocking): +1. Save callee-saved integer registers and SIMD registers into `ContextSaveArea`. +2. Save `rsp`/`sp` into the process descriptor. +3. Load the scheduler's stack pointer from a thread-local and jump back into the scheduler loop. + +**Resume**: +1. Load `rsp`/`sp` from the process descriptor. +2. Restore registers from `ContextSaveArea`. +3. `ret` — the return address is already on the restored stack, execution resumes exactly where the + actor yielded. + +**x86-64**: saves `rbx`, `rbp`, `r12`–`r15` (6 × 8 = 48 bytes) and `xmm0`–`xmm15` (16 × 16 = 256 +bytes) = 304 bytes total. Full SSE baseline is required; the compiler may autovectorise freely. +AVX-512 is deferred. + +**ARM64**: saves `x19`–`x30` (12 × 8 = 96 bytes, including the link register `x30` which must be +saved explicitly — it holds the return address, unlike x86 where `call` pushes it to the stack) and +`d8`–`d15` (8 × 8 = 64 bytes) = 160 bytes total. + +`ContextSaveArea` is a `Box` per actor. Lifetime equals the actor's lifetime; +no churn, no bulk deallocation, `Box` is correct. + +Initial platform target is x86-64 Linux. ARM64 and macOS are natural follow-ons. + +### Allocator-driven preemption + +Every Nth allocation, the allocator reads `RDTSC` and compares it against `timeslice_start`. If the +threshold is exceeded the actor yields. The workloads that starve a scheduler — sustained compute, +data transformation — are precisely the ones doing frequent allocations, so this approximation is +correct by construction. + +`RDTSC` is not monotonic across core migration; a slightly wrong timeslice is acceptable. Loom is +not a real-time scheduler. + +Known failure mode: tight no-alloc loops are invisible to this mechanism. Actors doing sustained +allocation-free compute must call `loom::yield_now()` explicitly, or offload to a thread pool +outside the actor scheduler (e.g. rayon). This is documented and acceptable — such loops are rare +in message-passing workloads. + +### Yield points + +An actor yields at: + +- **Channel send/recv** — the primary communication primitive +- **Mutex contention** — attempting to lock a held `Arc>` parks the actor +- **IO** — blocking on a socket or file descriptor parks the actor until the IO thread signals readiness +- **`loom::sleep(duration)`** — parks the actor; the timer wheel re-queues it on expiry +- **`loom::yield_now()`** — explicit cooperative yield +- **Allocator preemption** — as above +- **Spawn** — does not yield by default; the new actor is queued and the spawner continues + +`std::thread::sleep` inside an actor blocks the entire OS thread and should never be used. Loom +may emit a warning if it can detect this. + +### IO thread + +A single dedicated IO thread runs an `epoll`/`kqueue` loop. Actors blocking on IO register their +file descriptor and PID; the IO thread moves them back into the global queue when the fd is ready. +A `HashMap` maps fds to parked actors. Cancellation (actor dies while waiting on IO) +deregisters the fd. This is intentionally simple and not pluggable; Loom is not a general async +executor. + +### Communication + +Messages must be `Send` or `Copy`. Non-`Send` types cannot cross an actor boundary; this is +enforced by the type system with no runtime overhead. + +Two primitives only: + +- **Move** — transfer owned data across a channel. Zero copy. The sender relinquishes ownership + at the type level. This is the default. +- **`Arc>`** — for genuinely shared long-lived state. Explicit and visible. + +Cross-actor `Rc` or bare pointers are banned. There is no cycle detector. Cross-actor cycles are +banned by construction: either transfer ownership or use `Arc`. + +### PIDs + +A PID is a `(index, generation)` pair. The index may be reused after an actor dies; the generation +counter increments on every death. A stale handle holding the wrong generation is a detectable +error, not a silent misdirection. This avoids the ABA problem without reserving PID space forever. + +### Supervision + +Every actor has a supervisor, assigned at spawn. This is not optional. The root supervisor is +provided by the runtime; its death is a process exit. + +A supervisor receives one of three signals when a child actor terminates: + +- `Signal::Exit(pid)` — normal completion +- `Signal::Panic(pid, payload)` — caught via `catch_unwind` at the actor entry point boundary, + before unwinding can reach the assembly shim +- `Signal::Timeout(pid)` — actor exceeded a budget (see below) + +The supervisor decides: restart the actor, escalate to its own supervisor, or ignore. Restart +intensity is capped: if an actor panics more than N times within a time window, the supervisor +stops restarting and escalates. This prevents a bad prelude or corrupted input from spinning the +supervisor in a restart loop indefinitely. N and the window are configurable per supervisor with a +sensible global default. + +### Mutex timeout + +Every `loom::mutex` lock attempt is mediated by the scheduler. If the lock is not acquired within +a configurable timeout, the actor receives a `LockTimeout` error rather than parking forever. This +is a hard runtime guarantee, not a convention. Default timeout is global and configurable; +individual locks and individual call sites can override it. + +### Task joining + +Actors can spawn children and wait on a group of handles: + +```rust +let h1 = loom::spawn(|| compute_a()); +let h2 = loom::spawn(|| compute_b()); +let (a, b) = loom::join!(h1, h2); +``` + +`join!` parks the calling actor until all handles complete. The last child to finish re-queues the +parent. This is a countdown in the parent's descriptor; no polling, no waker registration. A +`join_timeout!` variant is a natural extension. + +### Timer wheel + +`loom::sleep` and supervision timeouts are driven by a timer wheel in the scheduler. Sleeping +actors are parked and re-queued by the timer thread on expiry. The timer wheel is internal +infrastructure; its design is an implementation detail. + +--- + +## Defer: Later Work + +- **Stack sizing policy** — initial size, growth factor, and whether stacks ever shrink are + implementation decisions to be made with profiling data, not up front. +- **Queue contention** — if `Mutex` proves to be a bottleneck under profiling, evaluate + `DashMap` or a lock-free work-stealing deque (e.g. `crossbeam-deque`). Not before. +- **AVX-512 context save** — extend `ContextSaveArea` when there is a concrete use case. +- **`loom::sleep` vs raw sleep semantics** — further control knobs deferred until the basic sleep + is working and real use cases are understood. +- **Supervision tree API** — the contract is defined; the recursive hierarchy, restart strategies, + and introspection API are implementation work. +- **no_std support** — the assembly shim is no_std friendly but the IO thread and allocator require + OS primitives. Target is no_std + `alloc` on hosted platforms; bare metal is out of scope. +- **Distribution** — Loom is a single-process runtime. No distribution protocol, no BEAM-style + clustering. + +--- + +## What Loom is Not + +- Not a drop-in replacement for Tokio. Loom does not implement `Future` or the async executor interface. +- Not a general allocator. Loom manages actor stacks; heap allocation for actor data goes through + the system allocator. +- Not Erlang. No hot code reloading, no distribution protocol, no BEAM bytecode. Loom is a + concurrency runtime, not a platform. +- Not a real-time scheduler. Timeslice accuracy is best-effort. diff --git a/README.md b/README.md new file mode 100644 index 0000000..1b4a5c7 --- /dev/null +++ b/README.md @@ -0,0 +1,82 @@ +# smarm + +> Silly Marks Abstract Rust Machine. A prototype green-thread actor runtime for Rust. + +Implements the core ideas in [`LOOM.md`](./LOOM.md): green-thread actors on a +shared heap, scheduled cooperatively, communicating only by `Send` messages. +Erlang's isolation model without Erlang's copying GC, Rust's zero-copy +ownership transfers without async's function colouring. + +The scheduler is multi-threaded — one OS thread per available CPU, all drawing +from a shared run queue. The single-threaded `run()` entry point is kept as a +convenience wrapper around `runtime::init(Config::exact(1)).run(f)`. + +## What's here + +| Module | What it does | +|--------------|------------------------------------------------------------------------| +| `stack` | `mmap`'d growable stack with guard page; SIGSEGV on overflow | +| `context` | `#[naked]` x86-64 context-switch shims, callee-saved regs only | +| `preempt` | Allocator-driven preemption; `check!()` macro for no-alloc loops | +| `pid` | `(index, generation)` PIDs; stale handles are detectable, not silent | +| `actor` | Trampoline + `catch_unwind` boundary at the actor entry point | +| `scheduler` | Run queue, slot table, spawn/join, parking, idle path | +| `channel` | Unbounded MPSC channel; `recv` parks the actor | +| `mutex` | `Mutex` with mandatory timeout; FIFO waiters; parks the green thread | +| `timer` | Min-heap of `(deadline, reason)`; `Sleep` and `WaitTimeout` reasons | +| `io` | `block_on_io` for blocking work; `wait_readable`/`wait_writable` + `read`/`write` via epoll | +| `supervisor` | `Signal::Exit` / `Signal::Panic` delivered to a parent actor's mailbox | + +## Quick taste + +```rust +use smarm::{run, spawn, channel}; + +run(|| { + let (tx, rx) = channel::(); + let h = spawn(move || { + for _ in 0..3 { + let v = rx.recv().unwrap(); + println!("got {v}"); + } + }); + for v in 1..=3i64 { + tx.send(v).unwrap(); + } + h.join().unwrap(); +}); +``` + +## Layout + +``` +src/ + stack.rs context.rs preempt.rs pid.rs actor.rs + scheduler.rs channel.rs mutex.rs timer.rs io.rs supervisor.rs + lib.rs +tests/ + per-module integration tests +benches/ + primes.rs fan-out/fan-in compute, vs tokio current_thread +LOOM.md design intent +``` + +## Building and running + +Standard Cargo. Requires Rust 1.95 or newer (the `#[naked]` attribute went stable +in 1.88; we use a few unrelated post-1.88 features). x86-64 Linux only — +ARM64 and macOS are on the deferred list because of the assembly shim and the +epoll dependency. + +```sh +cargo test # all tests +cargo test --test mutex # one module +cargo bench # primes benchmark vs tokio +``` + +## What's not here + +See the **Defer** section of `LOOM.md`. Notable absences: supervisor +restart-intensity caps, `join!` for handle groups, stack growth via remap, +hierarchical timer wheel, fd-wait timeouts, `Signal::Timeout`. Each is +mechanism we know how to add; none belongs in this iteration. diff --git a/benches/multi_scheduler.rs b/benches/multi_scheduler.rs new file mode 100644 index 0000000..5771e73 --- /dev/null +++ b/benches/multi_scheduler.rs @@ -0,0 +1,343 @@ +//! Benchmarks for the multi-scheduler runtime. +//! +//! Three workloads, three runtimes: +//! - smarm single-thread (exact = 1) +//! - smarm multi-thread (exact = available_parallelism) +//! - tokio current_thread (single-thread baseline) +//! - tokio multi-thread (the parallel comparison) +//! +//! Workloads: +//! 1. Fan-out / fan-in compute (primes) — CPU-bound, tests parallelism +//! 2. Ping-pong — message-passing overhead, park/unpark cost +//! 3. Spawn throughput — cost of spawn + join per actor + +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::Instant; + +// --------------------------------------------------------------------------- +// Shared helpers +// --------------------------------------------------------------------------- + +fn available_threads() -> usize { + std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1) +} + +fn print_header(title: &str) { + println!("\n{}", "=".repeat(80)); + println!(" {title}"); + println!("{}", "=".repeat(80)); + println!( + "{:>22} | {:>12} | {:>10} | {:>10} | {:>10}", + "runtime", "result", "median µs", "min µs", "max µs" + ); + println!("{}", "-".repeat(80)); +} + +fn run_n (u64, u128)>(name: &str, n: u32, mut f: F) { + let mut times = Vec::new(); + let mut last = 0u64; + for _ in 0..n { + let (v, t) = f(); + times.push(t); + last = v; + } + times.sort_unstable(); + let median = times[times.len() / 2]; + let min = *times.iter().min().unwrap(); + let max = *times.iter().max().unwrap(); + println!( + "{:>22} | {:>12} | {:>10} | {:>10} | {:>10}", + name, last, median, min, max + ); +} + +const ITERS: u32 = 7; + +// --------------------------------------------------------------------------- +// Workload 1: fan-out / fan-in primes +// --------------------------------------------------------------------------- + +const PRIME_N: u64 = 400_000; +const WORKERS: u64 = 64; + +fn is_prime(n: u64) -> bool { + if n < 2 { return false; } + if n < 4 { return true; } + if n % 2 == 0 { return false; } + let mut i = 3u64; + while i * i <= n { if n % i == 0 { return false; } i += 2; } + true +} + +fn count_primes(lo: u64, hi: u64) -> u64 { + (lo..hi).filter(|&n| is_prime(n)).count() as u64 +} + +fn primes_slice(w: u64) -> (u64, u64) { + let per = PRIME_N / WORKERS; + let lo = w * per; + let hi = if w + 1 == WORKERS { PRIME_N } else { lo + per }; + (lo, hi) +} + +fn bench_primes_smarm(threads: usize) -> (u64, u128) { + let total = Arc::new(AtomicU64::new(0)); + let t2 = total.clone(); + let start = Instant::now(); + smarm::runtime::init(smarm::runtime::Config::exact(threads)).run(move || { + let mut handles = Vec::new(); + for w in 0..WORKERS { + let (lo, hi) = primes_slice(w); + let tc = t2.clone(); + handles.push(smarm::spawn(move || { + tc.fetch_add(count_primes(lo, hi), Ordering::Relaxed); + })); + } + for h in handles { h.join().unwrap(); } + }); + (total.load(Ordering::Relaxed), start.elapsed().as_micros()) +} + +fn bench_primes_tokio_current() -> (u64, u128) { + let total = Arc::new(AtomicU64::new(0)); + let t2 = total.clone(); + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + let start = Instant::now(); + let local = tokio::task::LocalSet::new(); + local.block_on(&rt, async move { + let mut handles = Vec::new(); + for w in 0..WORKERS { + let (lo, hi) = primes_slice(w); + let tc = t2.clone(); + handles.push(tokio::task::spawn_local(async move { + tc.fetch_add(count_primes(lo, hi), Ordering::Relaxed); + })); + } + for h in handles { let _ = h.await; } + }); + (total.load(Ordering::Relaxed), start.elapsed().as_micros()) +} + +fn bench_primes_tokio_multi() -> (u64, u128) { + let total = Arc::new(AtomicU64::new(0)); + let t2 = total.clone(); + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(available_threads()) + .build() + .unwrap(); + let start = Instant::now(); + rt.block_on(async move { + let mut handles = Vec::new(); + for w in 0..WORKERS { + let (lo, hi) = primes_slice(w); + let tc = t2.clone(); + handles.push(tokio::spawn(async move { + tc.fetch_add(count_primes(lo, hi), Ordering::Relaxed); + })); + } + for h in handles { let _ = h.await; } + }); + (total.load(Ordering::Relaxed), start.elapsed().as_micros()) +} + +fn bench_primes_baseline() -> (u64, u128) { + let start = Instant::now(); + let total: u64 = (0..WORKERS).map(|w| { + let (lo, hi) = primes_slice(w); + count_primes(lo, hi) + }).sum(); + (total, start.elapsed().as_micros()) +} + +// --------------------------------------------------------------------------- +// Workload 2: channel ping-pong +// --------------------------------------------------------------------------- + +const PING_ROUNDS: u64 = 10_000; + +fn bench_pingpong_smarm(threads: usize) -> (u64, u128) { + let start = Instant::now(); + smarm::runtime::init(smarm::runtime::Config::exact(threads)).run(|| { + let (tx_a, rx_a) = smarm::channel::(); + let (tx_b, rx_b) = smarm::channel::(); + let ha = smarm::spawn(move || { + tx_a.send(0).unwrap(); + loop { + let v = rx_b.recv().unwrap(); + if v >= PING_ROUNDS { break; } + tx_a.send(v + 1).unwrap(); + } + }); + let hb = smarm::spawn(move || { + loop { + let v = rx_a.recv().unwrap(); + tx_b.send(v + 1).unwrap(); + if v + 1 >= PING_ROUNDS { break; } + } + }); + ha.join().unwrap(); + hb.join().unwrap(); + }); + (PING_ROUNDS, start.elapsed().as_micros()) +} + +fn bench_pingpong_tokio_current() -> (u64, u128) { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .unwrap(); + let start = Instant::now(); + let local = tokio::task::LocalSet::new(); + local.block_on(&rt, async move { + let (tx_a, mut rx_a) = tokio::sync::mpsc::unbounded_channel::(); + let (tx_b, mut rx_b) = tokio::sync::mpsc::unbounded_channel::(); + let ha = tokio::task::spawn_local(async move { + tx_a.send(0).unwrap(); + loop { + let v = rx_b.recv().await.unwrap(); + if v >= PING_ROUNDS { break; } + tx_a.send(v + 1).unwrap(); + } + }); + let hb = tokio::task::spawn_local(async move { + loop { + let v = rx_a.recv().await.unwrap(); + tx_b.send(v + 1).unwrap(); + if v + 1 >= PING_ROUNDS { break; } + } + }); + let _ = ha.await; + let _ = hb.await; + }); + (PING_ROUNDS, start.elapsed().as_micros()) +} + +fn bench_pingpong_tokio_multi() -> (u64, u128) { + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(2) // ping-pong only needs 2 threads + .enable_all() + .build() + .unwrap(); + let start = Instant::now(); + rt.block_on(async move { + let (tx_a, mut rx_a) = tokio::sync::mpsc::unbounded_channel::(); + let (tx_b, mut rx_b) = tokio::sync::mpsc::unbounded_channel::(); + let ha = tokio::spawn(async move { + tx_a.send(0).unwrap(); + loop { + let v = rx_b.recv().await.unwrap(); + if v >= PING_ROUNDS { break; } + tx_a.send(v + 1).unwrap(); + } + }); + let hb = tokio::spawn(async move { + loop { + let v = rx_a.recv().await.unwrap(); + tx_b.send(v + 1).unwrap(); + if v + 1 >= PING_ROUNDS { break; } + } + }); + let _ = ha.await; + let _ = hb.await; + }); + (PING_ROUNDS, start.elapsed().as_micros()) +} + +// --------------------------------------------------------------------------- +// Workload 3: spawn throughput +// --------------------------------------------------------------------------- + +const SPAWN_COUNT: u64 = 1_000; + +fn bench_spawn_smarm(threads: usize) -> (u64, u128) { + let counter = Arc::new(AtomicU64::new(0)); + let c = counter.clone(); + let start = Instant::now(); + smarm::runtime::init(smarm::runtime::Config::exact(threads)).run(move || { + let mut handles = Vec::new(); + for _ in 0..SPAWN_COUNT { + let cc = c.clone(); + handles.push(smarm::spawn(move || { + cc.fetch_add(1, Ordering::Relaxed); + })); + } + for h in handles { h.join().unwrap(); } + }); + (counter.load(Ordering::Relaxed), start.elapsed().as_micros()) +} + +fn bench_spawn_tokio_current() -> (u64, u128) { + let counter = Arc::new(AtomicU64::new(0)); + let c = counter.clone(); + let rt = tokio::runtime::Builder::new_current_thread().build().unwrap(); + let start = Instant::now(); + let local = tokio::task::LocalSet::new(); + local.block_on(&rt, async move { + let mut handles = Vec::new(); + for _ in 0..SPAWN_COUNT { + let cc = c.clone(); + handles.push(tokio::task::spawn_local(async move { + cc.fetch_add(1, Ordering::Relaxed); + })); + } + for h in handles { let _ = h.await; } + }); + (counter.load(Ordering::Relaxed), start.elapsed().as_micros()) +} + +fn bench_spawn_tokio_multi() -> (u64, u128) { + let counter = Arc::new(AtomicU64::new(0)); + let c = counter.clone(); + let rt = tokio::runtime::Builder::new_multi_thread() + .worker_threads(available_threads()) + .build() + .unwrap(); + let start = Instant::now(); + rt.block_on(async move { + let mut handles = Vec::new(); + for _ in 0..SPAWN_COUNT { + let cc = c.clone(); + handles.push(tokio::spawn(async move { + cc.fetch_add(1, Ordering::Relaxed); + })); + } + for h in handles { let _ = h.await; } + }); + (counter.load(Ordering::Relaxed), start.elapsed().as_micros()) +} + +// --------------------------------------------------------------------------- +// main +// --------------------------------------------------------------------------- + +fn main() { + let n = available_threads(); + println!("smarm multi-scheduler benchmarks"); + println!("available parallelism: {n} threads"); + println!("PRIME_N={PRIME_N}, WORKERS={WORKERS}, PING_ROUNDS={PING_ROUNDS}, SPAWN_COUNT={SPAWN_COUNT}"); + + // ---- Primes ---- + print_header(&format!("Fan-out/fan-in: count primes in [2, {PRIME_N}) across {WORKERS} workers")); + run_n("baseline (serial)", ITERS, bench_primes_baseline); + run_n("smarm single-thread", ITERS, || bench_primes_smarm(1)); + run_n(&format!("smarm {n}-thread"), ITERS, || bench_primes_smarm(n)); + run_n("tokio current_thread", ITERS, bench_primes_tokio_current); + run_n("tokio multi-thread", ITERS, bench_primes_tokio_multi); + + // ---- Ping-pong ---- + print_header(&format!("Ping-pong: {PING_ROUNDS} round-trips between two actors")); + run_n("smarm single-thread", ITERS, || bench_pingpong_smarm(1)); + run_n(&format!("smarm {n}-thread"), ITERS, || bench_pingpong_smarm(n)); + run_n("tokio current_thread", ITERS, bench_pingpong_tokio_current); + run_n("tokio multi-thread", ITERS, bench_pingpong_tokio_multi); + + // ---- Spawn throughput ---- + print_header(&format!("Spawn throughput: {SPAWN_COUNT} actors spawned and joined")); + run_n("smarm single-thread", ITERS, || bench_spawn_smarm(1)); + run_n(&format!("smarm {n}-thread"), ITERS, || bench_spawn_smarm(n)); + run_n("tokio current_thread", ITERS, bench_spawn_tokio_current); + run_n("tokio multi-thread", ITERS, bench_spawn_tokio_multi); +} diff --git a/benches/primes.rs b/benches/primes.rs new file mode 100644 index 0000000..7431e87 --- /dev/null +++ b/benches/primes.rs @@ -0,0 +1,134 @@ +//! Compute-heavy fan-out/fan-in benchmark. +//! +//! Counts primes in [2, N) across W workers (each handling a contiguous +//! slice), then sums the results. Tests pure compute throughput plus the +//! cost of spawn/join/channel. Single-threaded both sides (smarm has only +//! one OS thread; tokio is configured `current_thread`). +//! +//! Run with `cargo bench`. + +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; +use std::time::Instant; + +const N: u64 = 200_000; +const WORKERS: u64 = 16; +const ITERATIONS: u32 = 5; + +fn is_prime(n: u64) -> bool { + if n < 2 { return false; } + if n < 4 { return true; } + if n % 2 == 0 { return false; } + let mut i = 3u64; + while i * i <= n { + if n % i == 0 { return false; } + i += 2; + } + true +} + +fn count_primes_in(lo: u64, hi: u64) -> u64 { + let mut count = 0u64; + for n in lo..hi { + if is_prime(n) { count += 1; } + } + count +} + +fn slice(worker: u64) -> (u64, u64) { + let per = N / WORKERS; + let lo = worker * per; + let hi = if worker + 1 == WORKERS { N } else { (worker + 1) * per }; + (lo, hi) +} + +fn bench_smarm() -> (u64, u128) { + let total = Arc::new(AtomicU64::new(0)); + let total2 = total.clone(); + let start = Instant::now(); + + smarm::run(move || { + let mut handles = Vec::new(); + for w in 0..WORKERS { + let (lo, hi) = slice(w); + let t = total2.clone(); + handles.push(smarm::spawn(move || { + let c = count_primes_in(lo, hi); + t.fetch_add(c, Ordering::Relaxed); + })); + } + for h in handles { + h.join().unwrap(); + } + }); + + (total.load(Ordering::Relaxed), start.elapsed().as_micros()) +} + +fn bench_tokio() -> (u64, u128) { + let total = Arc::new(AtomicU64::new(0)); + let total2 = total.clone(); + let rt = tokio::runtime::Builder::new_current_thread() + .build() + .unwrap(); + let start = Instant::now(); + + let local = tokio::task::LocalSet::new(); + local.block_on(&rt, async move { + let mut handles = Vec::new(); + for w in 0..WORKERS { + let (lo, hi) = slice(w); + let t = total2.clone(); + handles.push(tokio::task::spawn_local(async move { + let c = count_primes_in(lo, hi); + t.fetch_add(c, Ordering::Relaxed); + })); + } + for h in handles { + let _ = h.await; + } + }); + + (total.load(Ordering::Relaxed), start.elapsed().as_micros()) +} + +fn bench_baseline() -> (u64, u128) { + let mut total = 0u64; + let start = Instant::now(); + for w in 0..WORKERS { + let (lo, hi) = slice(w); + total += count_primes_in(lo, hi); + } + (total, start.elapsed().as_micros()) +} + +fn run_n (u64, u128)>(name: &str, n: u32, mut f: F) { + let mut times = Vec::new(); + let mut last_count = 0; + for _ in 0..n { + let (c, t) = f(); + times.push(t); + last_count = c; + } + times.sort(); + let median = times[times.len() / 2]; + let min = *times.iter().min().unwrap(); + let max = *times.iter().max().unwrap(); + println!( + "{:>12} | primes: {:>6} | median: {:>8} µs | min: {:>8} µs | max: {:>8} µs", + name, last_count, median, min, max + ); +} + +fn main() { + println!( + "Counting primes in [2, {}) across {} workers, {} iterations each\n", + N, WORKERS, ITERATIONS + ); + println!("{:>12} | {:>15} | {:>16} | {:>15} | {:>15}", "runtime", "primes found", "median", "min", "max"); + println!("{}", "-".repeat(80)); + + run_n("baseline", ITERATIONS, bench_baseline); + run_n("smarm", ITERATIONS, bench_smarm); + run_n("tokio", ITERATIONS, bench_tokio); +} diff --git a/src/actor.rs b/src/actor.rs new file mode 100644 index 0000000..58ebd04 --- /dev/null +++ b/src/actor.rs @@ -0,0 +1,110 @@ +//! Actor descriptor and trampoline. +//! +//! An `Actor` owns its stack and holds the closure it will run. The +//! `trampoline` is a fixed `extern "C-unwind" fn()` that every actor enters +//! through; it pulls the closure out of a thread-local set by the scheduler +//! immediately before resume, invokes it inside `catch_unwind`, records the +//! outcome, and switches back to the scheduler. +//! +//! Why a thread-local and not, say, passing the closure pointer via a +//! register? Because the first resume goes through `ret`, not `call`, and +//! we have no other channel for parameters. The scheduler sets the +//! thread-local, switches in, the trampoline reads it. After the first +//! resume the closure has been consumed, so subsequent resumes don't need it. + +use crate::context::switch_to_scheduler; +use crate::pid::Pid; +use crate::stack::Stack; +use std::any::Any; +use std::cell::{Cell, RefCell}; +use std::panic; + +/// What an actor produced when it finished. Stored on the actor's slot, +/// drained by `JoinHandle::join` once the slot is marked done. +pub enum Outcome { + Exit, + Panic(Box), +} + +// Thread-locals that the scheduler writes immediately before `switch_to_actor`. +thread_local! { + /// The closure for the actor we're about to resume *for the first time*. + /// Consumed on first entry into the trampoline; `None` thereafter. + static CURRENT_ACTOR_BOX: RefCell>> = + const { RefCell::new(None) }; + + /// The PID of the actor currently executing on this OS thread. + /// Set on every resume so that `self_pid()` works inside actor code. + static CURRENT_PID: Cell> = const { Cell::new(None) }; + + /// Filled by the trampoline when the actor returns (normally or via + /// panic). The scheduler reads this after `switch_to_actor` returns. + static LAST_OUTCOME: RefCell> = const { RefCell::new(None) }; + + /// Set by the trampoline on completion; reset by the scheduler before + /// each resume so it never sees stale state. + static ACTOR_DONE: Cell = const { Cell::new(false) }; +} + +pub fn set_current_actor_box(b: Box) { + CURRENT_ACTOR_BOX.with(|c| *c.borrow_mut() = Some(b)); +} + +pub fn set_current_pid(p: Pid) { + CURRENT_PID.with(|c| c.set(Some(p))); +} + +pub fn clear_current_pid() { + CURRENT_PID.with(|c| c.set(None)); +} + +pub fn current_pid() -> Option { + CURRENT_PID.with(|c| c.get()) +} + +pub fn reset_actor_done() { + ACTOR_DONE.with(|c| c.set(false)); +} + +pub fn is_actor_done() -> bool { + ACTOR_DONE.with(|c| c.get()) +} + +pub fn take_last_outcome() -> Option { + LAST_OUTCOME.with(|r| r.borrow_mut().take()) +} + +/// The function whose address is written as the `ret` target on every actor +/// stack. The compiler must not inline this away. `extern "C-unwind"` permits +/// unwinding to cross the boundary, but `catch_unwind` here means unwinding +/// never actually does. +pub extern "C-unwind" fn trampoline() { + let b = CURRENT_ACTOR_BOX.with(|c| c.borrow_mut().take()) + .expect("trampoline entered without a closure set"); + + let outcome = match panic::catch_unwind(panic::AssertUnwindSafe(b)) { + Ok(()) => Outcome::Exit, + Err(payload) => Outcome::Panic(payload), + }; + + LAST_OUTCOME.with(|r| *r.borrow_mut() = Some(outcome)); + ACTOR_DONE.with(|c| c.set(true)); + + // Hand control back. The scheduler will tear down our slot and never + // resume us again. + unsafe { switch_to_scheduler() }; + // Unreachable. If it isn't, the scheduler has a bug. + unreachable!("scheduler resumed a done actor"); +} + +/// One actor's worth of state. Owned by the scheduler's slot table. +pub struct Actor { + /// The PID this actor was assigned at spawn time. + pub pid: Pid, + /// The stack the actor runs on. Dropped (munmap'd) when the actor dies. + pub stack: Stack, + /// The saved stack pointer. Updated on every yield. + pub sp: usize, + /// The PID of this actor's supervisor. Used to deliver `Signal` on death. + pub supervisor: Pid, +} diff --git a/src/channel.rs b/src/channel.rs new file mode 100644 index 0000000..2192277 --- /dev/null +++ b/src/channel.rs @@ -0,0 +1,153 @@ +//! Unbounded MPSC channels. +//! +//! Inner state is `Arc>>` so channels can be sent across OS +//! threads (required for the multi-scheduler runtime where a sender and +//! receiver may run on different scheduler threads simultaneously). +//! +//! Semantics: +//! - Senders are clonable; the last sender drop closes the channel. +//! - `Receiver::recv` on an empty open channel parks the receiver. +//! - `Receiver::recv` on an empty closed channel returns `Err(RecvError)`. +//! - `Sender::send` on an open channel always succeeds. +//! - `Sender::send` on a closed channel (receiver dropped) returns +//! `Err(SendError(value))`. +//! - When a send pushes to a previously empty queue and a receiver is +//! parked, the receiver is unparked. + +use crate::pid::Pid; +use std::collections::VecDeque; +use std::sync::{Arc, Mutex}; + +pub fn channel() -> (Sender, Receiver) { + let inner = Arc::new(Mutex::new(Inner { + queue: VecDeque::new(), + parked_receiver: None, + senders: 1, + receiver_alive: true, + })); + (Sender { inner: inner.clone() }, Receiver { inner }) +} + +struct Inner { + queue: VecDeque, + parked_receiver: Option, + senders: usize, + receiver_alive: bool, +} + +pub struct Sender { + inner: Arc>>, +} + +pub struct Receiver { + inner: Arc>>, +} + +#[derive(Debug, PartialEq, Eq)] +pub struct SendError(pub T); + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct RecvError; + +impl std::fmt::Display for RecvError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "channel closed") + } +} + +impl std::error::Error for RecvError {} + +impl Clone for Sender { + fn clone(&self) -> Self { + self.inner.lock().unwrap().senders += 1; + Sender { inner: self.inner.clone() } + } +} + +impl Drop for Sender { + fn drop(&mut self) { + let unpark = { + let mut g = self.inner.lock().unwrap(); + g.senders -= 1; + if g.senders == 0 && g.queue.is_empty() { + g.parked_receiver.take() + } else { + None + } + }; + if let Some(pid) = unpark { + crate::scheduler::unpark(pid); + } + } +} + +impl Drop for Receiver { + fn drop(&mut self) { + self.inner.lock().unwrap().receiver_alive = false; + } +} + +impl Sender { + pub fn send(&self, value: T) -> Result<(), SendError> { + let unpark = { + let mut g = self.inner.lock().unwrap(); + if !g.receiver_alive { + return Err(SendError(value)); + } + g.queue.push_back(value); + g.parked_receiver.take() + }; + if let Some(pid) = unpark { + let me = crate::actor::current_pid(); + crate::te!(crate::trace::Event::Send { sender: me.unwrap_or(crate::pid::Pid::new(u32::MAX, u32::MAX)), receiver: Some(pid) }); + crate::scheduler::unpark(pid); + } else { + let me = crate::actor::current_pid(); + crate::te!(crate::trace::Event::Send { sender: me.unwrap_or(crate::pid::Pid::new(u32::MAX, u32::MAX)), receiver: None }); + } + Ok(()) + } +} + +impl Receiver { + pub fn recv(&self) -> Result { + loop { + { + let mut g = self.inner.lock().unwrap(); + if let Some(v) = g.queue.pop_front() { + return Ok(v); + } + if g.senders == 0 { + return Err(RecvError); + } + let me = crate::actor::current_pid() + .expect("recv() called outside an actor"); + debug_assert!( + g.parked_receiver.is_none(), + "channel has more than one receiver" + ); + g.parked_receiver = Some(me); + crate::te!(crate::trace::Event::RecvPark(me)); + } + // Release the lock before parking — the unparker will need it. + crate::scheduler::park_current(); + // Woken up — record it before looping to check the queue. + if let Some(me) = crate::actor::current_pid() { + crate::te!(crate::trace::Event::RecvWake(me)); + } + } + } + + /// Non-blocking. `Ok(Some(v))` if a message was available, `Ok(None)` if + /// the channel is empty but open, `Err(RecvError)` if closed and drained. + pub fn try_recv(&self) -> Result, RecvError> { + let mut g = self.inner.lock().unwrap(); + if let Some(v) = g.queue.pop_front() { + return Ok(Some(v)); + } + if g.senders == 0 { + return Err(RecvError); + } + Ok(None) + } +} diff --git a/src/context.rs b/src/context.rs new file mode 100644 index 0000000..75c2840 --- /dev/null +++ b/src/context.rs @@ -0,0 +1,106 @@ +//! Cooperative context switching, x86-64. +//! +//! Two naked-asm functions move execution between a scheduler thread and an +//! actor running on its own mmap'd stack. The compiler cannot do this; the +//! whole point of `#[unsafe(naked)]` is that we control every instruction. +//! +//! `SCHEDULER_SP` and `ACTOR_SP` are thread-locals holding each side's saved +//! stack pointer. `init_actor_stack` builds the initial stack so that the +//! first `switch_to_actor` lands inside the entry function with `rsp % 16 == 8` +//! (the x86-64 ABI requirement at function entry). + +use std::cell::Cell; + +thread_local! { + static SCHEDULER_SP: Cell = const { Cell::new(0) }; + static ACTOR_SP: Cell = const { Cell::new(0) }; +} + +fn get_scheduler_sp() -> usize { SCHEDULER_SP.with(|c| c.get()) } +fn set_scheduler_sp(v: usize) { SCHEDULER_SP.with(|c| c.set(v)) } +pub fn get_actor_sp() -> usize { ACTOR_SP.with(|c| c.get()) } +pub fn set_actor_sp(v: usize) { ACTOR_SP.with(|c| c.set(v)) } + +// --------------------------------------------------------------------------- +// Initial stack layout +// +// After alignment, sp = top & ~15 - 8. Then we push (downward) six callee- +// saved register slots and a return address. The first `switch_to_actor` +// pops r15..rbx and `ret`s — landing in `entry` with rsp % 16 == 8. +// +// Layout (high → low), relative to aligned_top = top & ~15: +// aligned_top - 8 : entry ptr ← `ret` target. Post-ret: rsp % 16 == 8. +// aligned_top - 16 : rbx = 0 +// aligned_top - 24 : rbp = 0 +// aligned_top - 32 : r12 = 0 +// aligned_top - 40 : r13 = 0 +// aligned_top - 48 : r14 = 0 +// aligned_top - 56 : r15 = 0 ← initial rsp +// --------------------------------------------------------------------------- + +pub fn init_actor_stack(top: *mut u8, entry: extern "C-unwind" fn()) -> usize { + unsafe { + let mut sp = (top as usize & !15) - 8; + sp -= 8; (sp as *mut usize).write(entry as usize); // ret target + sp -= 8; (sp as *mut usize).write(0); // rbx + sp -= 8; (sp as *mut usize).write(0); // rbp + sp -= 8; (sp as *mut usize).write(0); // r12 + sp -= 8; (sp as *mut usize).write(0); // r13 + sp -= 8; (sp as *mut usize).write(0); // r14 + sp -= 8; (sp as *mut usize).write(0); // r15 + sp + } +} + +// --------------------------------------------------------------------------- +// Context switch shims +// +// Each shim: +// 1. Pushes the six callee-saved integer registers. +// 2. Snaps rsp into rdi and calls the Rust helper that stores it. +// 3. Calls the Rust helper that returns the *other* side's saved rsp. +// 4. Moves that into rsp. +// 5. Pops the six registers and rets. +// +// XMM registers are NOT saved here. We rely on every yield happening through +// a Rust call site, which means the compiler has spilled any live XMM state +// to the stack before we get here. (This is the same argument the compiler +// uses internally — callee-saved regs are what survive a `call`, and the +// SysV AMD64 ABI says XMM0–15 are all caller-saved.) If we ever yield from +// a place that isn't a Rust call boundary, this assumption breaks. +// --------------------------------------------------------------------------- + +#[unsafe(naked)] +unsafe extern "C" fn switch_to_actor_asm() { + core::arch::naked_asm!( + "push rbx", "push rbp", "push r12", "push r13", "push r14", "push r15", + "mov rdi, rsp", + "call {set_sched_sp}", + "call {get_actor_sp}", + "mov rsp, rax", + "pop r15", "pop r14", "pop r13", "pop r12", "pop rbp", "pop rbx", + "ret", + set_sched_sp = sym set_scheduler_sp, + get_actor_sp = sym get_actor_sp, + ); +} + +/// Resume the actor whose sp is in `ACTOR_SP`. Returns when the actor yields. +pub unsafe fn switch_to_actor() { + unsafe { switch_to_actor_asm() }; +} + +#[unsafe(naked)] +pub unsafe extern "C" fn switch_to_scheduler() { + core::arch::naked_asm!( + "push rbx", "push rbp", "push r12", "push r13", "push r14", "push r15", + "mov rdi, rsp", + "call {set_actor_sp}", + "call {get_sched_sp}", + "mov rsp, rax", + "pop r15", "pop r14", "pop r13", "pop r12", "pop rbp", "pop rbx", + "ret", + set_actor_sp = sym set_actor_sp, + get_sched_sp = sym get_scheduler_sp, + ); +} diff --git a/src/io.rs b/src/io.rs new file mode 100644 index 0000000..71a96d3 --- /dev/null +++ b/src/io.rs @@ -0,0 +1,520 @@ +//! Off-scheduler IO: blocking-work offload and epoll-based fd readiness. +//! +//! `block_on_io(closure)` runs `closure` on a dedicated worker OS thread, +//! parks the calling actor in the meantime, and returns the closure's +//! value when it completes. Lets actors call into blocking C libraries, +//! synchronous file IO, or anything else that doesn't fit the readiness +//! model. +//! +//! `wait_readable(fd)` / `wait_writable(fd)` register interest in an fd +//! with epoll and park the calling actor. When the fd becomes ready, the +//! epoll thread unparks the actor. The actual `read(2)`/`write(2)` syscall +//! runs back on the scheduler thread, *inside* the actor — buffer never +//! leaves the actor, no copying through an intermediary thread. Built on +//! these are the conveniences `read(fd, &mut buf)` and `write(fd, &buf)`. +//! +//! Architecture +//! ============ +//! Per `run()`, two OS threads: +//! - **epoll thread**: owns the epollfd. Loops in `epoll_wait`. On a +//! ready fd, pushes `Completion::FdReady { pid, fd, events }` to the +//! shared completion queue and writes the scheduler-wake pipe. On the +//! shutdown pipe (also registered in epollfd), exits. +//! - **pool thread**: blocks on the request mpsc. Runs the closure +//! inside `catch_unwind`, pushes `Completion::Blocking { pid, result }`, +//! writes the scheduler-wake pipe. +//! +//! Both threads share a single `completions: Arc>>` +//! and the same scheduler-wake pipe. +//! +//! `epoll_ctl` (register/unregister fd interest) is called by the +//! scheduler thread *directly* on the epollfd. That's well-defined per +//! `epoll_ctl(2)`: a thread may be calling `epoll_wait` on the epollfd +//! while another thread calls `epoll_ctl`. Avoids needing a second mpsc +//! and a second wake mechanism. +//! +//! Epoll mode +//! ========== +//! Level-triggered with EPOLLONESHOT. After a wakeup the kernel +//! auto-disarms the fd, so we never get two wakeups for one +//! `wait_readable` call. The scheduler explicitly `EPOLL_CTL_DEL`s the fd +//! on completion to free the slot for re-registration. Net effect: each +//! `wait_readable(fd)` is one ADD, one wakeup, one DEL — symmetric and +//! stateless between calls. +//! +//! Fd hygiene +//! ========== +//! If an actor dies while waiting on an fd, the registration is leaked +//! (the fd stays in the epollfd, armed). EPOLLONESHOT bounds the damage: +//! at most one stale wakeup, after which the kernel disarms. The stale +//! wakeup hits a dead pid in `waiters` and is dropped. Acceptable for v0.2; +//! a future pass should DEL on actor death. +//! +//! Buffers used with `read`/`write` should be on fds opened with +//! `O_NONBLOCK`. If they aren't, the syscall may block the scheduler +//! thread despite the readiness notification (the fd reporting readable +//! doesn't guarantee the syscall completes without blocking — e.g. a +//! signal could be delivered). Documented; not enforced. +//! +//! Panic handling +//! ============== +//! The pool worker runs the closure inside `catch_unwind` and ships either +//! the return value or the panic payload back to the scheduler. +//! `block_on_io` resumes the panic on the calling actor's stack, so the +//! actor's supervisor sees a real `Signal::Panic` as if the work had run +//! inline. Fd-wait primitives don't run user code on the IO thread, so +//! they have no equivalent panic-propagation path. + +use crate::pid::Pid; +use std::any::Any; +use std::collections::{HashMap, VecDeque}; +use std::io; +use std::os::fd::RawFd; +use std::panic; +use std::sync::mpsc; +use std::sync::{Arc, Mutex}; +use std::thread::JoinHandle as OsJoinHandle; + +// --------------------------------------------------------------------------- +// Wire types +// --------------------------------------------------------------------------- + +/// What the pool stores while computing a result. `Ok` is the closure's +/// return value (boxed as `Any`); `Err` is the panic payload. +pub type IoResult = Result, Box>; + +struct Request { + pid: Pid, + /// The work to perform. Returns the wire-form result directly. + work: Box IoResult + Send>, +} + +/// Completion message from either IO thread back to the scheduler. +pub enum Completion { + /// A `block_on_io` closure has finished (Ok = return value, Err = panic + /// payload). + Blocking { pid: Pid, result: IoResult }, + /// An fd registered via `wait_readable`/`wait_writable` is ready. The + /// scheduler looks up the parked pid in `waiters`, unparks it, and + /// removes the entry. `pid` isn't in this variant because the epoll + /// thread doesn't have access to the `waiters` map; the scheduler + /// thread owns that. + FdReady { fd: RawFd, events: u32 }, +} + +// --------------------------------------------------------------------------- +// IoThread — created per `run()`, owned by `SchedulerState`. +// --------------------------------------------------------------------------- + +pub struct IoThread { + // ----- Channels & queues ----- + + /// Submission queue into the blocking-work pool. + tx: mpsc::Sender, + /// Shared completion queue, fed by both the pool and the epoll thread. + completions: Arc>>, + /// Pipe the scheduler polls in its idle path. Both IO threads write to + /// `wake_write` after pushing a completion. + wake_read: RawFd, + wake_write: RawFd, + + // ----- Epoll machinery ----- + + /// The epollfd, owned by `IoThread`. Callable cross-thread via + /// `epoll_ctl` per the man page. + epollfd: RawFd, + /// Pipe used to signal the epoll thread to exit. Registered inside the + /// epollfd so a single `epoll_wait` covers both fd readiness and + /// shutdown. + shutdown_read: RawFd, + shutdown_write: RawFd, + /// One parked actor per registered fd. Populated by `wait_readable` / + /// `wait_writable` and drained by the scheduler when a `FdReady` + /// completion is processed. + pub waiters: HashMap, + + // ----- Threads ----- + + pool_thread: Option>, + epoll_thread: Option>, + + /// Number of `block_on_io` requests in-flight. Used by the scheduler's + /// idle path to decide whether to wait on the pipe or exit. Fd waits + /// are not counted here; they're counted by `waiters.len()`. + pub outstanding: u32, +} + +impl IoThread { + pub fn start() -> io::Result { + // Scheduler-facing wake pipe. + let (wake_read, wake_write) = make_pipe()?; + // Pool submission channel + shared completion queue. + let (tx, rx) = mpsc::channel::(); + let completions: Arc>> = + Arc::new(Mutex::new(VecDeque::new())); + + // Epoll machinery. + let epollfd = unsafe { libc::epoll_create1(libc::EPOLL_CLOEXEC) }; + if epollfd < 0 { + // Best-effort fd cleanup before bailing. + unsafe { + libc::close(wake_read); + libc::close(wake_write); + } + return Err(io::Error::last_os_error()); + } + + let (shutdown_read, shutdown_write) = match make_pipe() { + Ok(p) => p, + Err(e) => { + unsafe { + libc::close(epollfd); + libc::close(wake_read); + libc::close(wake_write); + } + return Err(e); + } + }; + + // Register the shutdown pipe in epollfd. We use a sentinel `data` + // value to recognise shutdown events. RawFd values are non-negative, + // so u64::MAX is unambiguously not a real fd-data encoding. + let mut shutdown_ev = libc::epoll_event { + events: libc::EPOLLIN as u32, + u64: SHUTDOWN_EPOLL_TOKEN, + }; + if unsafe { + libc::epoll_ctl( + epollfd, + libc::EPOLL_CTL_ADD, + shutdown_read, + &mut shutdown_ev as *mut _, + ) + } < 0 + { + let e = io::Error::last_os_error(); + unsafe { + libc::close(epollfd); + libc::close(shutdown_read); + libc::close(shutdown_write); + libc::close(wake_read); + libc::close(wake_write); + } + return Err(e); + } + + // Spawn pool thread. + let pool_comps = completions.clone(); + let pool_thread = std::thread::Builder::new() + .name("smarm-io-pool".into()) + .spawn(move || pool_loop(rx, pool_comps, wake_write))?; + + // Spawn epoll thread. + let epoll_comps = completions.clone(); + let epoll_thread = std::thread::Builder::new() + .name("smarm-io-epoll".into()) + .spawn(move || epoll_loop(epollfd, epoll_comps, wake_write))?; + + Ok(Self { + tx, + completions, + wake_read, + wake_write, + epollfd, + shutdown_read, + shutdown_write, + waiters: HashMap::new(), + pool_thread: Some(pool_thread), + epoll_thread: Some(epoll_thread), + outstanding: 0, + }) + } + + /// Hand a request to the pool. Increments `outstanding`. + pub fn submit(&mut self, pid: Pid, work: Box IoResult + Send>) { + self.outstanding += 1; + // Send can only fail if the pool has hung up, which only happens + // on shutdown. submit during shutdown is a bug. + self.tx + .send(Request { pid, work }) + .expect("io pool hung up unexpectedly"); + } + + /// Drain every available completion. Caller (the scheduler) routes the + /// results and updates `outstanding` / `waiters` accordingly. + pub fn drain_completions(&mut self) -> Vec { + let mut q = self.completions.lock().unwrap(); + let mut out = Vec::with_capacity(q.len()); + while let Some(c) = q.pop_front() { + out.push(c); + } + out + } + + pub fn wake_fd(&self) -> RawFd { + self.wake_read + } + + /// Register interest in `fd` becoming readable/writable; record `pid` + /// as the parked waiter. The epoll thread will push a `FdReady` + /// completion when the kernel signals. + /// + /// EPOLLONESHOT: one wakeup per registration. The scheduler must + /// `epoll_del` on completion to free the slot for re-registration. + pub fn epoll_register( + &mut self, + fd: RawFd, + pid: Pid, + readable: bool, + writable: bool, + ) -> io::Result<()> { + // Two actors waiting on the same fd would be a misuse: the kernel + // delivers exactly one EPOLLONESHOT wakeup, so the second waiter + // would hang. Reject up front. + if self.waiters.contains_key(&fd) { + return Err(io::Error::new( + io::ErrorKind::AlreadyExists, + "fd already has a parked waiter", + )); + } + + // Defensive cleanup: if a previous actor died while waiting on this + // fd, the kernel-side registration was leaked (we don't walk all + // waiters on actor death). A bare DEL is harmless if the fd isn't + // registered (ENOENT), and removes any leak. + unsafe { + libc::epoll_ctl(self.epollfd, libc::EPOLL_CTL_DEL, fd, std::ptr::null_mut()); + } + + let mut events: u32 = libc::EPOLLONESHOT as u32; + if readable { + events |= libc::EPOLLIN as u32; + } + if writable { + events |= libc::EPOLLOUT as u32; + } + let mut ev = libc::epoll_event { + events, + u64: fd as u64, + }; + let r = unsafe { + libc::epoll_ctl(self.epollfd, libc::EPOLL_CTL_ADD, fd, &mut ev as *mut _) + }; + if r < 0 { + return Err(io::Error::last_os_error()); + } + self.waiters.insert(fd, pid); + Ok(()) + } + + /// Remove `fd` from the epollfd. Called by the scheduler after a + /// `FdReady` completion, so the next `wait_readable(fd)` can ADD again. + /// + /// Does NOT touch `waiters` — that's the scheduler's bookkeeping; this + /// is purely the kernel-side cleanup. + pub fn epoll_deregister(&mut self, fd: RawFd) { + // EPOLL_CTL_DEL of an already-removed fd returns ENOENT; ignore. + unsafe { + libc::epoll_ctl(self.epollfd, libc::EPOLL_CTL_DEL, fd, std::ptr::null_mut()); + } + } +} + +impl Drop for IoThread { + fn drop(&mut self) { + // 1. Signal the epoll thread to exit by writing the shutdown pipe. + unsafe { + let buf: [u8; 1] = [0]; + // Single byte; we don't care about EINTR retry here — worst + // case the epoll thread blocks until process exit, which is + // fine because we then close fds out from under it. + libc::write(self.shutdown_write, buf.as_ptr() as *const _, 1); + } + + // 2. Hang up the pool's request channel so the pool thread exits. + let (dead_tx, _) = mpsc::channel::(); + let real_tx = std::mem::replace(&mut self.tx, dead_tx); + drop(real_tx); + + // 3. Join both threads. + if let Some(h) = self.epoll_thread.take() { + let _ = h.join(); + } + if let Some(h) = self.pool_thread.take() { + let _ = h.join(); + } + + // 4. Close fds. + unsafe { + libc::close(self.epollfd); + libc::close(self.shutdown_read); + libc::close(self.shutdown_write); + libc::close(self.wake_read); + libc::close(self.wake_write); + } + } +} + +/// Sentinel `epoll_event.u64` distinguishing the shutdown pipe from +/// registered actor fds. RawFd values fit in i32, so the high bits are +/// available for a marker; we use u64::MAX which can't be a valid fd. +const SHUTDOWN_EPOLL_TOKEN: u64 = u64::MAX; + +// --------------------------------------------------------------------------- +// Pool loop +// --------------------------------------------------------------------------- + +fn pool_loop( + rx: mpsc::Receiver, + completions: Arc>>, + wake_write: RawFd, +) { + while let Ok(Request { pid, work }) = rx.recv() { + let result: IoResult = match panic::catch_unwind(panic::AssertUnwindSafe(work)) { + Ok(r) => r, + Err(payload) => Err(payload), + }; + completions + .lock() + .unwrap() + .push_back(Completion::Blocking { pid, result }); + wake_scheduler(wake_write); + } +} + +// --------------------------------------------------------------------------- +// Epoll loop +// --------------------------------------------------------------------------- + +fn epoll_loop( + epollfd: RawFd, + completions: Arc>>, + wake_write: RawFd, +) { + // Buffer for epoll_wait. 64 is plenty for our scale; if a real load + // appears that needs more, this is a one-line change. + const MAX_EVENTS: usize = 64; + let mut events: [libc::epoll_event; MAX_EVENTS] = unsafe { std::mem::zeroed() }; + + loop { + let n = unsafe { + libc::epoll_wait( + epollfd, + events.as_mut_ptr(), + MAX_EVENTS as libc::c_int, + -1, + ) + }; + + if n < 0 { + let e = unsafe { *libc::__errno_location() }; + if e == libc::EINTR { + continue; + } + // Anything else here is a programming error (EBADF on epollfd + // after we've closed it from Drop — the close races with us). + // Treat as shutdown. + return; + } + + let mut shutdown_requested = false; + let mut pushed_any = false; + { + let mut q = completions.lock().unwrap(); + for ev in events.iter().take(n as usize) { + if ev.u64 == SHUTDOWN_EPOLL_TOKEN { + shutdown_requested = true; + continue; + } + let fd = ev.u64 as RawFd; + q.push_back(Completion::FdReady { + fd, + events: ev.events, + }); + pushed_any = true; + } + } + + if pushed_any { + wake_scheduler(wake_write); + } + if shutdown_requested { + return; + } + } +} + +/// Write one byte to the scheduler's wake pipe. Retries on EINTR; ignores +/// EAGAIN (pipe full means there's already an outstanding wake we haven't +/// consumed yet, which is sufficient). +fn wake_scheduler(wake_write: RawFd) { + let buf: [u8; 1] = [0]; + unsafe { + loop { + let n = libc::write(wake_write, buf.as_ptr() as *const _, 1); + if n < 0 { + let e = *libc::__errno_location(); + if e == libc::EINTR { + continue; + } + } + break; + } + } +} + +// --------------------------------------------------------------------------- +// Pipe helpers (unchanged from v0.2) +// --------------------------------------------------------------------------- + +fn make_pipe() -> io::Result<(RawFd, RawFd)> { + let mut fds: [libc::c_int; 2] = [0; 2]; + let r = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC | libc::O_NONBLOCK) }; + if r != 0 { + return Err(io::Error::last_os_error()); + } + Ok((fds[0], fds[1])) +} + +/// Drain pending bytes from the wake pipe. The scheduler calls this after +/// a `poll` wakeup so the next idle call sees an empty pipe. +pub fn drain_wake_pipe(fd: RawFd) { + let mut buf = [0u8; 64]; + loop { + let n = unsafe { libc::read(fd, buf.as_mut_ptr() as *mut _, buf.len()) }; + if n <= 0 { + break; + } + } +} + +/// Block on `fd` for up to `timeout`, returning when either there's data +/// to read or the timeout elapses. `None` for `timeout` means wait forever. +pub fn poll_wake(fd: RawFd, timeout: Option) { + let timeout_ms: libc::c_int = match timeout { + None => -1, + Some(d) => { + let ms = d.as_millis(); + if ms > i32::MAX as u128 { + i32::MAX + } else { + ms as i32 + } + } + }; + let mut pfd = libc::pollfd { + fd, + events: libc::POLLIN, + revents: 0, + }; + loop { + let r = unsafe { libc::poll(&mut pfd as *mut _, 1, timeout_ms) }; + if r < 0 { + let e = unsafe { *libc::__errno_location() }; + if e == libc::EINTR { + continue; + } + } + break; + } +} diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..cdcf05c --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,60 @@ +//! # smarm — Silly Marks Abstract Rust Machine +//! +//! Erlang-style green-thread actor concurrency for Rust. +//! +//! Multi-threaded: N scheduler OS threads (default: one per CPU) share a +//! single global run queue behind a `Mutex`. Actors communicate by sending +//! `Send` messages over channels; every actor has a supervisor. Synchronisation +//! primitives — `Mutex` with mandatory lock timeouts, channel `recv`, +//! `sleep`, and epoll-backed `wait_readable`/`wait_writable` — all park the +//! green thread, never the OS thread. +//! +//! See `LOOM.md` for the design intent and the deferred-for-later list. + +pub mod stack; +pub mod context; +pub mod preempt; +pub mod pid; +pub mod actor; +pub mod channel; +pub mod scheduler; +pub mod supervisor; +pub mod timer; +pub mod io; +pub mod mutex; +pub mod runtime; +pub mod trace; + +// --------------------------------------------------------------------------- +// Global allocator +// --------------------------------------------------------------------------- + +#[global_allocator] +static ALLOCATOR: preempt::PreemptingAllocator = preempt::PreemptingAllocator; + +// --------------------------------------------------------------------------- +// Public API re-exports +// --------------------------------------------------------------------------- + +pub use channel::{channel, Receiver, RecvError, Sender}; +pub use mutex::{LockTimeout, Mutex, MutexGuard}; +pub use pid::Pid; +pub use runtime::{init, Config, Runtime}; +pub use scheduler::{ + block_on_io, run, self_pid, sleep, spawn, spawn_under, wait_readable, wait_writable, + yield_now, JoinError, JoinHandle, +}; +pub use supervisor::Signal; + +// --------------------------------------------------------------------------- +// check!() +// --------------------------------------------------------------------------- + +/// Voluntarily check whether this actor's timeslice has expired, yielding +/// if so. +#[macro_export] +macro_rules! check { + () => { + $crate::preempt::maybe_preempt() + }; +} diff --git a/src/mutex.rs b/src/mutex.rs new file mode 100644 index 0000000..5bfc239 --- /dev/null +++ b/src/mutex.rs @@ -0,0 +1,248 @@ +//! Actor-aware mutex with mandatory timeout. +//! +//! `Mutex` parks the calling *green* thread on contention rather than +//! blocking the OS thread. Every lock attempt is bounded by a timeout. +//! +//! Internals use `Arc>` so the type is genuinely +//! `Send + Sync` and can be shared across scheduler threads. +//! +//! Fairness: FIFO. Poisoning: none. Reentrance: deadlock (caller bug). + +use crate::pid::Pid; +use crate::scheduler; +use crate::timer::{self, TimerTarget}; +use std::collections::VecDeque; +use std::sync::{Arc, Mutex as StdMutex}; +use std::time::Duration; + +pub const DEFAULT_TIMEOUT: Duration = Duration::from_secs(30); + +#[derive(Debug, PartialEq, Eq, Clone, Copy)] +pub struct LockTimeout; + +impl std::fmt::Display for LockTimeout { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "mutex lock timed out") + } +} +impl std::error::Error for LockTimeout {} + +// --------------------------------------------------------------------------- +// Internals +// --------------------------------------------------------------------------- + +struct Wait { + pid: Pid, + seq: u64, +} + +struct MutexState { + holder: Option, + waiters: VecDeque, + next_seq: u64, + default_timeout: Duration, +} + +struct MutexCore { + state: StdMutex, +} + +impl MutexCore { + fn new(default_timeout: Duration) -> Self { + Self { + state: StdMutex::new(MutexState { + holder: None, + waiters: VecDeque::new(), + next_seq: 0, + default_timeout, + }), + } + } +} + +impl TimerTarget for MutexCore { + fn on_timeout(&self, pid: Pid, wait_seq: u64) { + let unpark = { + let mut st = self.state.lock().unwrap(); + // Remove from waiters only if still there with matching seq. + // If the lock was already granted (holder == Some(pid)), the + // timer fired after the grant — treat as no-op; the actor + // will see `is_holder == true` and return Ok. + if st.holder == Some(pid) { + return; + } + let pos = st.waiters.iter().position(|w| w.pid == pid && w.seq == wait_seq); + if pos.is_some() { + st.waiters.remove(pos.unwrap()); + true + } else { + false + } + }; + if unpark { + scheduler::unpark(pid); + } + } +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +pub struct Mutex { + core: Arc, + /// Protected value. `None` while a guard is live; `Some` while free. + value: Arc>>, +} + +impl Mutex { + pub fn new(value: T) -> Self { + Self { + core: Arc::new(MutexCore::new(DEFAULT_TIMEOUT)), + value: Arc::new(StdMutex::new(Some(value))), + } + } + + pub fn set_default_timeout(&self, timeout: Duration) { + self.core.state.lock().unwrap().default_timeout = timeout; + } + + pub fn lock(&self) -> Result, LockTimeout> { + let timeout = self.core.state.lock().unwrap().default_timeout; + self.lock_timeout(timeout) + } + + pub fn lock_timeout(&self, timeout: Duration) -> Result, LockTimeout> { + // Outside the runtime (e.g. in tests, after run() returns) there is no + // current actor PID. Fall back to a blocking std::sync::Mutex acquire. + let Some(me) = crate::actor::current_pid() else { + return self.lock_blocking(); + }; + + // Fast path: nobody holds it. + { + let mut st = self.core.state.lock().unwrap(); + if st.holder.is_none() { + st.holder = Some(me); + drop(st); + let value = self.value.lock().unwrap().take() + .expect("Mutex: value missing on free fast path"); + return Ok(MutexGuard { mutex: self, value: Some(value) }); + } + } + + // Slow path: register as a waiter, set timeout, park. + let _np = scheduler::NoPreempt::enter(); + let seq = { + let mut st = self.core.state.lock().unwrap(); + let seq = st.next_seq; + st.next_seq = st.next_seq.wrapping_add(1); + st.waiters.push_back(Wait { pid: me, seq }); + seq + }; + + let target: Arc = self.core.clone(); + let deadline = timer::deadline_from_now(timeout); + scheduler::insert_wait_timer(deadline, me, target, seq); + scheduler::park_current(); + + // Resumed. Are we the holder? + let is_holder = self.core.state.lock().unwrap().holder == Some(me); + if is_holder { + let value = self.value.lock().unwrap().take() + .expect("Mutex: value missing after grant"); + Ok(MutexGuard { mutex: self, value: Some(value) }) + } else { + Err(LockTimeout) + } + } + + pub fn try_lock(&self) -> Option> { + let me = crate::actor::current_pid()?; + let mut st = self.core.state.lock().unwrap(); + if st.holder.is_some() { + return None; + } + st.holder = Some(me); + drop(st); + let value = self.value.lock().unwrap().take() + .expect("Mutex: value missing on try_lock free path"); + Some(MutexGuard { mutex: self, value: Some(value) }) + } + + /// Blocking fallback used when called outside the smarm runtime. + /// Spins on the internal std mutex; no actor parking, no timeout. + fn lock_blocking(&self) -> Result, LockTimeout> { + // We have no PID to register as holder, so we bypass the holder/waiter + // tracking and just grab the value mutex directly. This is safe because + // outside the runtime there are no green threads competing. + let value = loop { + let v = self.value.lock().unwrap().take(); + if let Some(v) = v { break v; } + std::thread::yield_now(); + }; + Ok(MutexGuard { mutex: self, value: Some(value) }) + } +} + +impl Clone for Mutex { + fn clone(&self) -> Self { + Self { core: self.core.clone(), value: self.value.clone() } + } +} + +// Genuinely Send + Sync now that internals are Arc>. +unsafe impl Send for Mutex {} +unsafe impl Sync for Mutex {} + +// --------------------------------------------------------------------------- +// Guard +// --------------------------------------------------------------------------- + +pub struct MutexGuard<'a, T> { + mutex: &'a Mutex, + value: Option, +} + +impl std::ops::Deref for MutexGuard<'_, T> { + type Target = T; + fn deref(&self) -> &T { self.value.as_ref().expect("MutexGuard: value missing") } +} + +impl std::ops::DerefMut for MutexGuard<'_, T> { + fn deref_mut(&mut self) -> &mut T { + self.value.as_mut().expect("MutexGuard: value missing") + } +} + +impl std::fmt::Debug for MutexGuard<'_, T> { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.debug_tuple("MutexGuard") + .field(self.value.as_ref().expect("MutexGuard: value missing")) + .finish() + } +} + +impl Drop for MutexGuard<'_, T> { + fn drop(&mut self) { + let v = self.value.take().expect("MutexGuard: double drop"); + *self.mutex.value.lock().unwrap() = Some(v); + + let next_pid = { + let mut st = self.mutex.core.state.lock().unwrap(); + match st.waiters.pop_front() { + Some(w) => { + st.holder = Some(w.pid); + Some(w.pid) + } + None => { + st.holder = None; + None + } + } + }; + if let Some(pid) = next_pid { + scheduler::unpark(pid); + } + } +} diff --git a/src/pid.rs b/src/pid.rs new file mode 100644 index 0000000..2c1a7d9 --- /dev/null +++ b/src/pid.rs @@ -0,0 +1,38 @@ +//! Process identifiers. +//! +//! A `Pid` is `(index, generation)`. The index is a slot in the scheduler's +//! actor table; the generation increments every time that slot is reused. +//! A stale `Pid` (correct index, wrong generation) is a detectable error, +//! not a silent misdirection — solves the ABA problem without exhausting +//! the PID space. + +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Pid { + index: u32, + generation: u32, +} + +impl Pid { + #[inline] + pub const fn new(index: u32, generation: u32) -> Self { + Self { index, generation } + } + + #[inline] + pub const fn index(self) -> u32 { self.index } + + #[inline] + pub const fn generation(self) -> u32 { self.generation } +} + +impl std::fmt::Debug for Pid { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "Pid({}.{})", self.index, self.generation) + } +} + +impl std::fmt::Display for Pid { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "<{}.{}>", self.index, self.generation) + } +} diff --git a/src/preempt.rs b/src/preempt.rs new file mode 100644 index 0000000..4ef52b6 --- /dev/null +++ b/src/preempt.rs @@ -0,0 +1,129 @@ +//! Preemption hooks. +//! +//! Preemption is event-driven: every preemption event decrements a +//! thread-local counter (`ALLOC_COUNT`). When the counter hits zero, we +//! read RDTSC and, if the actor's timeslice has expired, call +//! `switch_to_scheduler` to yield. Resetting the counter to `ALLOC_INTERVAL` +//! amortises the RDTSC across many cheap events. +//! +//! Two event sources today: +//! - `PreemptingAllocator` — heap allocations. +//! - `smarm::check!()` — explicit preemption point for tight no-alloc +//! loops, since stable Rust gives us no transparent way to preempt +//! such loops (`__rust_probestack` is emitted inline by LLVM and not +//! called at runtime). +//! +//! Both sources share `ALLOC_COUNT`, so the timeslice check fires at the +//! same rate regardless of whether the actor is alloc-heavy, check-heavy, +//! or mixed. +//! +//! All state is thread-local. The scheduler enables preemption on resume +//! and disables it on the return path, so the scheduler can never preempt +//! itself. +//! +//! TSC frequency is machine-dependent; `TIMESLICE_CYCLES` is a constant +//! calibrated for ~100µs on a 3 GHz CPU. A real implementation would +//! measure it at startup. For v0.1 the constant suffices. + +use std::alloc::{GlobalAlloc, Layout, System}; +use std::cell::Cell; + +const ALLOC_INTERVAL: u32 = 128; +const TIMESLICE_CYCLES: u64 = 300_000; // ≈ 100µs on a 3 GHz CPU + +thread_local! { + /// While `false`, the allocator hook is a no-op. + pub static PREEMPTION_ENABLED: Cell = const { Cell::new(false) }; + + /// Countdown to next RDTSC check. Reset to `ALLOC_INTERVAL` on resume. + static ALLOC_COUNT: Cell = const { Cell::new(ALLOC_INTERVAL) }; + + /// RDTSC value written by the scheduler on every actor resume. + static TIMESLICE_START: Cell = const { Cell::new(0) }; +} + +/// Arm the timeslice. Called by the scheduler on every resume. +pub fn reset_timeslice() { + ALLOC_COUNT.with(|c| c.set(ALLOC_INTERVAL)); + TIMESLICE_START.with(|c| c.set(rdtsc())); +} + +#[inline(always)] +pub fn rdtsc() -> u64 { + unsafe { + // SAFETY: x86-64 only. `lfence` serialises the instruction stream so + // we don't measure time before prior instructions retire. + core::arch::asm!("lfence", options(nostack, nomem, preserves_flags)); + core::arch::x86_64::_rdtsc() + } +} + +pub struct PreemptingAllocator; + +unsafe impl GlobalAlloc for PreemptingAllocator { + #[inline] + unsafe fn alloc(&self, layout: Layout) -> *mut u8 { + maybe_preempt(); + unsafe { System.alloc(layout) } + } + + #[inline] + unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) { + unsafe { System.dealloc(ptr, layout) } + } + + #[inline] + unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 { + maybe_preempt(); + unsafe { System.alloc_zeroed(layout) } + } + + #[inline] + unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 { + maybe_preempt(); + unsafe { System.realloc(ptr, layout, new_size) } + } +} + +/// Shared preemption check. Called by every preemption event source — the +/// heap allocator today, `smarm::check!()` for tight no-alloc loops. +/// Decrements `ALLOC_COUNT`; every `ALLOC_INTERVAL` calls reads the +/// timeslice clock and yields if expired. +/// +/// **Invariant**: must not be called inside a "prep-to-park" region — +/// e.g. between registering as a channel's parked receiver and calling +/// `park_current()`. A preemption-driven yield in that window would +/// reach the scheduler with state=Runnable, the unparker would no-op, +/// the actor would then park, and the wakeup would be lost. Library +/// code that touches the parking primitives must keep its prep-to-park +/// regions allocation-free and check!()-free. +#[inline(always)] +pub fn maybe_preempt() { + ALLOC_COUNT.with(|c| { + let n = c.get(); + if n == 0 { + c.set(ALLOC_INTERVAL); + if PREEMPTION_ENABLED.with(|e| e.get()) { + let start = TIMESLICE_START.with(|s| s.get()); + if rdtsc().saturating_sub(start) > TIMESLICE_CYCLES { + // SAFETY: reachable only inside an actor (the scheduler + // sets PREEMPTION_ENABLED on resume and clears it on + // return). The scheduler stack is therefore valid. + unsafe { crate::context::switch_to_scheduler() }; + } + } + } else { + c.set(n - 1); + } + }); +} + +// --------------------------------------------------------------------------- +// Test helpers +// --------------------------------------------------------------------------- + +/// Force-expire the timeslice so the next RDTSC check preempts. +pub fn expire_timeslice_for_test() { + TIMESLICE_START.with(|c| c.set(0)); + ALLOC_COUNT.with(|c| c.set(0)); +} diff --git a/src/runtime.rs b/src/runtime.rs new file mode 100644 index 0000000..37481a6 --- /dev/null +++ b/src/runtime.rs @@ -0,0 +1,718 @@ +//! Multi-scheduler runtime: configuration, initialisation, and the shared +//! state that all scheduler OS threads operate against. +//! +//! # Architecture +//! +//! ```text +//! init(Config) → Runtime (Arc) +//! +//! RuntimeInner { +//! shared: Mutex ← slot table, run queue, timers, IO +//! stats: Vec ← one per thread, lockless atomics (RFC 000) +//! io_parked: AtomicU32 ← actors parked on IO +//! sleeping: AtomicU32 ← actors parked on timer +//! } +//! ``` +//! +//! `Runtime::run(f)` spawns N OS threads (one per `Config::resolved_thread_count()`), +//! each running `schedule_loop`. It blocks until all scheduler threads exit, +//! i.e. until the run queue is empty and nothing is pending. +//! +//! Each scheduler thread holds an `Arc` clone. Per-thread +//! identity is a small integer index, stored in a thread-local, used to index +//! into `stats`. +//! +//! # Timer / IO drain (try-lock, one-winner) +//! +//! On each loop iteration every scheduler thread tries `try_lock()` on a +//! separate `drain_lock: Mutex<()>`. The winner drains due timers and IO +//! completions; losers skip and move straight to popping an actor from the +//! run queue. This is the simplest correct approach; revisit if the drain +//! becomes a measured bottleneck. + +use crate::actor::{ + clear_current_pid, current_pid, is_actor_done, reset_actor_done, + set_current_actor_box, set_current_pid, take_last_outcome, Actor, Outcome, +}; +use crate::channel::Sender; +use crate::context::{get_actor_sp, set_actor_sp, switch_to_actor}; +use crate::io::IoThread; +use crate::pid::Pid; +use crate::preempt::PREEMPTION_ENABLED; +use crate::supervisor::Signal; +use crate::timer::Timers; + +use std::collections::VecDeque; +use std::sync::atomic::{AtomicU32, AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; +use std::thread; + +// --------------------------------------------------------------------------- +// Config +// --------------------------------------------------------------------------- + +/// Runtime configuration. +/// +/// ``` +/// use smarm::runtime::Config; +/// +/// // Use all available CPUs (default): +/// let c = Config::default(); +/// +/// // Exactly 4 scheduler threads: +/// let c = Config::exact(4); +/// +/// // Between 2 and 8, clamped to available parallelism: +/// let c = Config::new(2, 8, None); +/// ``` +#[derive(Clone, Debug)] +pub struct Config { + min: usize, + max: usize, + exact: Option, +} + +impl Config { + /// Exact thread count; takes precedence over min/max. + pub fn exact(n: usize) -> Self { + assert!(n >= 1, "scheduler thread count must be ≥ 1"); + Self { min: n, max: n, exact: Some(n) } + } + + /// Bounded range. Thread count = clamp(available_parallelism, min, max). + pub fn new(min: usize, max: usize, exact: Option) -> Self { + assert!(min >= 1, "min must be ≥ 1"); + assert!(max >= min, "max must be ≥ min"); + if let Some(e) = exact { + assert!(e >= 1, "exact must be ≥ 1"); + } + Self { min, max, exact } + } + + /// The number of scheduler threads this config resolves to. + pub fn resolved_thread_count(&self) -> usize { + if let Some(e) = self.exact { + return e; + } + let avail = thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1); + avail.clamp(self.min, self.max) + } +} + +impl Default for Config { + fn default() -> Self { + let avail = thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1); + Self { min: 1, max: avail, exact: None } + } +} + +// --------------------------------------------------------------------------- +// Per-thread stats (RFC 000 Layer 1 primitives) +// --------------------------------------------------------------------------- + +/// Lockless per-scheduler-thread counters. Written only by the owning thread; +/// readable from any thread (introspection actor, tests). +pub struct SchedulerStats { + /// PID index of the actor currently on-CPU, or `u32::MAX` when idle. + pub current_pid_index: AtomicU32, + /// Snapshot of run queue length maintained on every push/pop. + pub run_queue_len: AtomicU64, +} + +impl SchedulerStats { + fn new() -> Self { + Self { + current_pid_index: AtomicU32::new(u32::MAX), + run_queue_len: AtomicU64::new(0), + } + } +} + +// --------------------------------------------------------------------------- +// Runtime stats snapshot (for tests / introspection) +// --------------------------------------------------------------------------- + +pub struct RuntimeStats { + pub(crate) inner: Arc, +} + +impl RuntimeStats { + /// Sum of run queue lengths across all scheduler threads. + pub fn total_run_queue_len(&self) -> u64 { + self.inner.stats.iter() + .map(|s| s.run_queue_len.load(Ordering::Relaxed)) + .sum() + } + + /// Number of scheduler threads. + pub fn scheduler_count(&self) -> usize { + self.inner.stats.len() + } + + /// Actors currently parked on IO. + pub fn io_parked_count(&self) -> u32 { + self.inner.io_parked.load(Ordering::Relaxed) + } + + /// Actors currently sleeping on a timer. + pub fn sleeping_count(&self) -> u32 { + self.inner.sleeping.load(Ordering::Relaxed) + } +} + +// --------------------------------------------------------------------------- +// Shared state (behind Mutex<>) +// --------------------------------------------------------------------------- + +pub(crate) const ACTOR_STACK_SIZE: usize = 64 * 1024; + +#[derive(Debug)] +pub(crate) enum State { Runnable, Parked, Done } + +pub(crate) struct Slot { + pub(crate) generation: u32, + pub(crate) actor: Option, + pub(crate) state: State, + pub(crate) waiters: Vec, + pub(crate) outcome: Option, + pub(crate) supervisor_channel: Option>, + pub(crate) outstanding_handles: u32, + pub(crate) pending_io_result: Option, + /// Set by `unpark()` when the actor is still running (not yet Parked). + /// The scheduler checks this after a Park yield and re-queues instead + /// of sleeping, closing the lost-wakeup window. + pub(crate) pending_unpark: bool, +} + +impl Slot { + fn vacant() -> Self { + Self { + generation: 0, + actor: None, + state: State::Done, + waiters: Vec::new(), + outcome: None, + supervisor_channel: None, + outstanding_handles: 0, + pending_io_result: None, + pending_unpark: false, + } + } +} + +pub(crate) type Closure = Box; + +pub(crate) struct SharedState { + pub(crate) slots: Vec, + pub(crate) free_list: Vec, + pub(crate) run_queue: VecDeque, + pub(crate) root_pid: Option, + pub(crate) timers: Timers, + pub(crate) io: Option, + /// Closures awaiting their first resume, keyed by Pid. + pub(crate) pending_closures: Vec<(Pid, Closure)>, +} + +impl SharedState { + fn new() -> Self { + Self { + slots: Vec::new(), + free_list: Vec::new(), + run_queue: VecDeque::new(), + root_pid: None, + timers: Timers::new(), + io: None, + pending_closures: Vec::new(), + } + } + + pub(crate) fn allocate_slot(&mut self) -> (u32, u32) { + if let Some(idx) = self.free_list.pop() { + let gen = self.slots[idx as usize].generation; + (idx, gen) + } else { + let idx = self.slots.len() as u32; + self.slots.push(Slot::vacant()); + (idx, 0) + } + } + + pub(crate) fn slot(&self, pid: Pid) -> Option<&Slot> { + let s = self.slots.get(pid.index() as usize)?; + if s.generation == pid.generation() { Some(s) } else { None } + } + + pub(crate) fn slot_mut(&mut self, pid: Pid) -> Option<&mut Slot> { + let s = self.slots.get_mut(pid.index() as usize)?; + if s.generation == pid.generation() { Some(s) } else { None } + } + + pub(crate) fn pop_pending_closure(&mut self, pid: Pid) -> Option { + let pos = self.pending_closures.iter().position(|(p, _)| *p == pid)?; + Some(self.pending_closures.swap_remove(pos).1) + } +} + +// --------------------------------------------------------------------------- +// RuntimeInner — the shared core behind an Arc +// --------------------------------------------------------------------------- + +pub(crate) struct RuntimeInner { + pub(crate) shared: Mutex, + /// Try-lock: exactly one scheduler thread drains timers/IO per iteration. + drain_lock: Mutex<()>, + /// Per-thread stats, indexed by scheduler thread slot (0..N). + pub(crate) stats: Vec, + /// Global counters for RFC 000 primitives. + pub(crate) io_parked: AtomicU32, + pub(crate) sleeping: AtomicU32, +} + +impl RuntimeInner { + fn new(thread_count: usize) -> Arc { + let stats = (0..thread_count).map(|_| SchedulerStats::new()).collect(); + Arc::new(Self { + shared: Mutex::new(SharedState::new()), + drain_lock: Mutex::new(()), + stats, + io_parked: AtomicU32::new(0), + sleeping: AtomicU32::new(0), + }) + } + + pub(crate) fn with_shared(&self, f: impl FnOnce(&mut SharedState) -> R) -> R { + // Preemption must be off while we hold the shared mutex. If an actor + // called with_shared (e.g. from spawn, join, sleep) and the allocator + // fired maybe_preempt() while the lock was held, switch_to_scheduler() + // would context-switch to the scheduler loop, which would immediately + // deadlock trying to acquire the same mutex. + let prev = crate::preempt::PREEMPTION_ENABLED.with(|c| c.replace(false)); + let result = f(&mut self.shared.lock().unwrap()); + crate::preempt::PREEMPTION_ENABLED.with(|c| c.set(prev)); + result + } + + /// Returns `None` when the mutex is poisoned. + /// Used in `unpark` / channel Drop which can fire after teardown. + pub(crate) fn try_with_shared(&self, f: impl FnOnce(&mut SharedState) -> R) -> Option { + let prev = crate::preempt::PREEMPTION_ENABLED.with(|c| c.replace(false)); + let result = match self.shared.lock() { + Ok(mut g) => Some(f(&mut g)), + Err(p) => Some(f(&mut p.into_inner())), + }; + crate::preempt::PREEMPTION_ENABLED.with(|c| c.set(prev)); + result + } +} + +// --------------------------------------------------------------------------- +// Runtime — the public handle +// --------------------------------------------------------------------------- + +pub struct Runtime { + inner: Arc, + thread_count: usize, +} + +/// Initialise the runtime with the given config. Returns a reusable handle. +pub fn init(config: Config) -> Runtime { + let n = config.resolved_thread_count(); + Runtime { + inner: RuntimeInner::new(n), + thread_count: n, + } +} + +impl Runtime { + /// Run `f` as the initial actor, block until all actors finish. + /// Can be called multiple times sequentially on the same `Runtime`. + pub fn run(&self, f: impl FnOnce() + Send + 'static) { + // Open the trace store for this run (no-op without smarm-trace). + #[cfg(feature = "smarm-trace")] + crate::trace::open(); + + // Re-initialise shared state for this run. + { + let mut s = self.inner.shared.lock().unwrap(); + assert!(s.run_queue.is_empty(), "run() called while previous run still active"); + s.root_pid = Some(ROOT_PID); + s.io = Some(IoThread::start().expect("failed to start IO thread")); + } + + // Spawn the initial actor through the public spawn path (which + // requires a running runtime in the thread-local). + RUNTIME.with(|r| *r.borrow_mut() = Some(self.inner.clone())); + let initial_handle = crate::scheduler::spawn(f); + + // Launch N-1 extra scheduler threads. The calling thread is thread 0. + let mut os_threads = Vec::new(); + for slot in 1..self.thread_count { + let inner = self.inner.clone(); + let t = thread::spawn(move || { + RUNTIME.with(|r| *r.borrow_mut() = Some(inner.clone())); + SCHED_SLOT.with(|s| s.set(slot)); + schedule_loop(&inner, slot); + RUNTIME.with(|r| *r.borrow_mut() = None); + }); + os_threads.push(t); + } + + // Thread 0 runs the loop on the calling thread. + SCHED_SLOT.with(|s| s.set(0)); + schedule_loop(&self.inner, 0); + + // Wait for all other scheduler threads. + for t in os_threads { + let _ = t.join(); + } + + // Drop initial handle (decrements outstanding_handles count). + drop(initial_handle); + + // Tear down IO and clean up shared state for the next run() call. + let mut s = self.inner.shared.lock().unwrap(); + drop(s.io.take()); // joins IO threads + s.pending_closures.clear(); + // Reset per-thread stats. + for stat in &self.inner.stats { + stat.current_pid_index.store(u32::MAX, Ordering::Relaxed); + stat.run_queue_len.store(0, Ordering::Relaxed); + } + self.inner.io_parked.store(0, Ordering::Relaxed); + self.inner.sleeping.store(0, Ordering::Relaxed); + + RUNTIME.with(|r| *r.borrow_mut() = None); + + // Flush trace to disk (no-op without smarm-trace). + #[cfg(feature = "smarm-trace")] + crate::trace::flush(); + } + + /// Snapshot of runtime statistics for introspection / tests. + pub fn stats(&self) -> RuntimeStats { + RuntimeStats { inner: self.inner.clone() } + } +} + +// --------------------------------------------------------------------------- +// Thread-locals +// --------------------------------------------------------------------------- + +use std::cell::{Cell, RefCell}; + +thread_local! { + /// The RuntimeInner for the current run(). Set by run() on the calling + /// thread and by each spawned scheduler thread. + pub(crate) static RUNTIME: RefCell>> = + const { RefCell::new(None) }; + + /// This scheduler thread's index into RuntimeInner::stats. + static SCHED_SLOT: Cell = const { Cell::new(0) }; + + /// What the actor wants when it yields back to the scheduler. + static YIELD_INTENT: Cell = const { Cell::new(YieldIntent::Yield) }; +} + +#[derive(Copy, Clone)] +pub(crate) enum YieldIntent { Yield, Park } + +pub(crate) fn set_yield_intent(i: YieldIntent) { + YIELD_INTENT.with(|c| c.set(i)); +} + +// --------------------------------------------------------------------------- +// Sentinel root PID +// --------------------------------------------------------------------------- + +pub const ROOT_PID: Pid = Pid::new(u32::MAX, u32::MAX); + +// --------------------------------------------------------------------------- +// Slot reclamation +// --------------------------------------------------------------------------- + +pub(crate) fn reclaim_slot(s: &mut SharedState, pid: Pid) { + let idx = pid.index(); + let slot = &mut s.slots[idx as usize]; + slot.generation = slot.generation.wrapping_add(1); + slot.actor = None; + slot.outcome = None; + slot.waiters.clear(); + slot.supervisor_channel = None; + slot.state = State::Done; + slot.outstanding_handles = 0; + slot.pending_unpark = false; + slot.pending_io_result = None; + s.free_list.push(idx); +} + +// --------------------------------------------------------------------------- +// finalize_actor +// --------------------------------------------------------------------------- + +fn finalize_actor(inner: &Arc, pid: Pid, outcome: Outcome) { + let (joiner_outcome, sup_signal) = match outcome { + Outcome::Exit => (Outcome::Exit, Signal::Exit(pid)), + Outcome::Panic(payload) => ( + Outcome::Panic(payload), + Signal::Panic(pid, Box::new(()) as Box), + ), + }; + + let (waiters, supervisor_pid) = inner.with_shared(|s| { + let slot = s.slot_mut(pid).expect("finalize_actor: slot vanished"); + let sup = slot.actor.as_ref().map(|a| a.supervisor); + slot.outcome = Some(joiner_outcome); + slot.state = State::Done; + slot.actor = None; + (std::mem::take(&mut slot.waiters), sup) + }); + + // Deliver to supervisor. + if let Some(sup) = supervisor_pid { + let sender = inner.with_shared(|s| { + s.slot(sup).and_then(|slot| slot.supervisor_channel.clone()) + }); + if let Some(sender) = sender { + let _ = sender.send(sup_signal); + } + } + + // Unpark joiners. + for joiner in waiters { + crate::scheduler::unpark(joiner); + } + + // Reclaim if no outstanding handles. + inner.with_shared(|s| { + let reclaim = s.slot(pid).map(|slot| slot.outstanding_handles == 0).unwrap_or(false); + if reclaim { reclaim_slot(s, pid); } + }); +} + +// --------------------------------------------------------------------------- +// schedule_loop — runs on each scheduler OS thread +// --------------------------------------------------------------------------- + +fn schedule_loop(inner: &Arc, slot: usize) { + let stats = &inner.stats[slot]; + + loop { + // ---------------------------------------------------------------- + // 1. Try to win the drain lock (timers + IO). One winner per round; + // losers skip immediately and proceed to step 2. + // ---------------------------------------------------------------- + if let Ok(_drain_guard) = inner.drain_lock.try_lock() { + let now = std::time::Instant::now(); + + // Drain due timers. + let due = inner.with_shared(|s| s.timers.pop_due(now)); + for entry in due { + match entry.reason { + crate::timer::Reason::Sleep => { + inner.with_shared(|s| { + if let Some(slot) = s.slot_mut(entry.pid) { + if matches!(slot.state, State::Parked) { + slot.state = State::Runnable; + s.run_queue.push_back(entry.pid); + crate::te!(crate::trace::Event::Enqueue(entry.pid)); + } + } + }); + } + crate::timer::Reason::WaitTimeout { target, wait_seq } => { + // Runs outside with_shared — the callback may call unpark. + target.on_timeout(entry.pid, wait_seq); + } + } + } + + // Drain IO completions. + let completions = inner.with_shared(|s| { + s.io.as_mut().map(|io| io.drain_completions()).unwrap_or_default() + }); + for completion in completions { + match completion { + crate::io::Completion::Blocking { pid, result } => { + inner.with_shared(|s| { + if let Some(io) = s.io.as_mut() { + io.outstanding = io.outstanding.saturating_sub(1); + } + if let Some(slot) = s.slot_mut(pid) { + slot.pending_io_result = Some(result); + if matches!(slot.state, State::Parked) { + slot.state = State::Runnable; + s.run_queue.push_back(pid); + crate::te!(crate::trace::Event::Enqueue(pid)); + } + } + }); + } + crate::io::Completion::FdReady { fd, events: _ } => { + inner.with_shared(|s| { + let parked_pid = s.io.as_mut().and_then(|io| { + let pid = io.waiters.remove(&fd); + io.epoll_deregister(fd); + pid + }); + if let Some(pid) = parked_pid { + if let Some(slot) = s.slot_mut(pid) { + if matches!(slot.state, State::Parked) { + slot.state = State::Runnable; + s.run_queue.push_back(pid); + crate::te!(crate::trace::Event::Enqueue(pid)); + } + } + } + }); + } + } + } + } // drain_guard drops here + + // ---------------------------------------------------------------- + // 2. Pop a runnable actor from the shared queue. + // ---------------------------------------------------------------- + let pid = match inner.with_shared(|s| { + let len = s.run_queue.len() as u64; + stats.run_queue_len.store(len, Ordering::Relaxed); + s.run_queue.pop_front() + }) { + Some(p) => { + crate::te!(crate::trace::Event::Dequeue(p)); + p + } + None => { + // Nothing runnable. Check whether we should wait or exit. + let (next_deadline, io_outstanding, wake_fd, queue_empty, live_actors) = + inner.with_shared(|s| { + let next = s.timers.peek_deadline(); + let (out, fd) = match s.io.as_ref() { + Some(io) => ( + io.outstanding + io.waiters.len() as u32, + Some(io.wake_fd()), + ), + None => (0, None), + }; + // Count actors that are not Done (Runnable or Parked). + let live = s.slots.iter().filter(|slot| { + slot.actor.is_some() + }).count(); + (next, out, fd, s.run_queue.is_empty(), live) + }); + + match (next_deadline, io_outstanding, wake_fd, queue_empty, live_actors) { + // Queue is now non-empty (another thread added work): retry. + (_, _, _, false, _) => continue, + // Truly idle — no timers, no IO, no live actors. + (None, 0, _, true, 0) => return, + // Live actors but queue empty: they must be parked on IO or + // timers. Wait on the appropriate source. + (Some(deadline), _, fd_opt, true, _) => { + let now = std::time::Instant::now(); + if deadline > now { + let timeout = deadline - now; + match fd_opt { + Some(fd) => { + crate::io::poll_wake(fd, Some(timeout)); + crate::io::drain_wake_pipe(fd); + } + None => thread::sleep(timeout), + } + } + continue; + } + (None, _, Some(fd), true, _) => { + crate::io::poll_wake(fd, None); + crate::io::drain_wake_pipe(fd); + continue; + } + // Live actors, queue empty, no IO/timers: they're parked + // waiting for each other (potential deadlock in user code), + // or another thread is about to add work. Sleep briefly to + // avoid hammering the shared mutex. + _ => { + thread::sleep(std::time::Duration::from_micros(100)); + continue; + } + } + } + }; + + // ---------------------------------------------------------------- + // 3. Resume the actor. + // ---------------------------------------------------------------- + let sp = match inner.with_shared(|s| { + s.slot(pid).and_then(|slot| slot.actor.as_ref().map(|a| a.sp)) + }) { + Some(sp) => sp, + None => continue, // stale pid + }; + + // First resume: move the closure into the trampoline's thread-local. + if let Some(b) = inner.with_shared(|s| s.pop_pending_closure(pid)) { + set_current_actor_box(b); + } + + // Update per-thread stats: record who's on-CPU. + stats.current_pid_index.store(pid.index(), Ordering::Relaxed); + + set_actor_sp(sp); + set_current_pid(pid); + reset_actor_done(); + YIELD_INTENT.with(|c| c.set(YieldIntent::Yield)); + crate::preempt::reset_timeslice(); + PREEMPTION_ENABLED.with(|c| c.set(true)); + + crate::te!(crate::trace::Event::Resume(pid)); + unsafe { switch_to_actor() }; + + PREEMPTION_ENABLED.with(|c| c.set(false)); + stats.current_pid_index.store(u32::MAX, Ordering::Relaxed); + clear_current_pid(); + + let intent = YIELD_INTENT.with(|c| c.get()); + let new_sp = get_actor_sp(); + + if is_actor_done() { + crate::te!(crate::trace::Event::Done(pid)); + let outcome = take_last_outcome().unwrap_or(Outcome::Exit); + finalize_actor(inner, pid, outcome); + } else { + inner.with_shared(|s| { + if let Some(slot) = s.slot_mut(pid) { + if let Some(actor) = slot.actor.as_mut() { + actor.sp = new_sp; + } + match intent { + YieldIntent::Yield => { + crate::te!(crate::trace::Event::Yield(pid)); + slot.state = State::Runnable; + s.run_queue.push_back(pid); + crate::te!(crate::trace::Event::Enqueue(pid)); + } + YieldIntent::Park => { + // Check if unpark() fired while the actor was + // still running (between registering in the + // channel and calling park_current). If so, + // re-queue immediately instead of parking. + if slot.pending_unpark { + slot.pending_unpark = false; + slot.state = State::Runnable; + s.run_queue.push_back(pid); + crate::te!(crate::trace::Event::UnparkFlagConsumed(pid)); + crate::te!(crate::trace::Event::Enqueue(pid)); + } else { + crate::te!(crate::trace::Event::Park(pid)); + slot.state = State::Parked; + } + } + } + } + }); + } + } +} diff --git a/src/scheduler.rs b/src/scheduler.rs new file mode 100644 index 0000000..efde859 --- /dev/null +++ b/src/scheduler.rs @@ -0,0 +1,349 @@ +//! Scheduler public API — thin façade over the multi-scheduler runtime. +//! +//! All heavy lifting lives in `runtime.rs`. This module exposes the same +//! surface that the rest of the codebase (channel, mutex, io, timer, actor) +//! calls into, plus the public API re-exported from `lib.rs`. +//! +//! The single-threaded `run()` entry point is kept as a convenience wrapper +//! around `runtime::init(Config::exact(1)).run(f)`. + +use crate::actor::current_pid; +use crate::channel::Sender; +use crate::pid::Pid; +use crate::runtime::{ + self, RuntimeInner, YieldIntent, ROOT_PID, RUNTIME, +}; +use crate::supervisor::Signal; +use std::sync::Arc; + +// --------------------------------------------------------------------------- +// with_runtime / try_with_runtime +// --------------------------------------------------------------------------- + +/// Borrow the current runtime. Panics if called outside `Runtime::run()`. +pub(crate) fn with_runtime(f: impl FnOnce(&Arc) -> R) -> R { + RUNTIME.with(|r| { + let b = r.borrow(); + let inner = b.as_ref().expect("smarm: not inside Runtime::run()"); + f(inner) + }) +} + +/// Borrow the runtime if present; returns `None` otherwise. +/// Used on cleanup paths (channel Drop during teardown). +pub(crate) fn try_with_runtime(f: impl FnOnce(&Arc) -> R) -> Option { + RUNTIME.with(|r| r.borrow().as_ref().map(|inner| f(inner))) +} + +// --------------------------------------------------------------------------- +// JoinHandle / JoinError +// --------------------------------------------------------------------------- + +#[derive(Debug)] +pub struct JoinError { + pub payload: Box, +} + +pub struct JoinHandle { + pid: Pid, + consumed: bool, +} + +impl JoinHandle { + pub fn pid(&self) -> Pid { self.pid } + + pub fn join(mut self) -> Result<(), JoinError> { + use crate::actor::Outcome; + use crate::runtime::State; // need State visibility + + let me = current_pid().expect("join() called outside an actor"); + + loop { + let outcome = with_runtime(|inner| { + inner.with_shared(|s| { + let slot = s.slot_mut(self.pid) + .expect("join: target slot has been reused"); + if matches!(slot.state, State::Done) { + Some(slot.outcome.take().expect("Done slot must have outcome")) + } else { + slot.waiters.push(me); + None + } + }) + }); + + match outcome { + Some(o) => { + self.consumed = true; + self.decrement_handle_count(); + return match o { + Outcome::Exit => Ok(()), + Outcome::Panic(p) => Err(JoinError { payload: p }), + }; + } + None => { + let _np = NoPreempt::enter(); + park_current(); + } + } + } + } + + fn decrement_handle_count(&mut self) { + with_runtime(|inner| { + inner.with_shared(|s| { + let should_reclaim = match s.slot_mut(self.pid) { + Some(slot) => { + slot.outstanding_handles = + slot.outstanding_handles.saturating_sub(1); + matches!(slot.state, crate::runtime::State::Done) + && slot.outstanding_handles == 0 + } + None => false, + }; + if should_reclaim { + crate::runtime::reclaim_slot(s, self.pid); + } + }) + }); + } +} + +impl Drop for JoinHandle { + fn drop(&mut self) { + if !self.consumed { + // May be called outside run() if handle is dropped after teardown. + if try_with_runtime(|_| ()).is_some() { + self.decrement_handle_count(); + } + } + } +} + +// --------------------------------------------------------------------------- +// spawn / spawn_under / self_pid +// --------------------------------------------------------------------------- + +pub fn spawn(f: impl FnOnce() + Send + 'static) -> JoinHandle { + let parent = current_pid() + .or_else(|| with_runtime(|inner| inner.with_shared(|s| s.root_pid))) + .expect("spawn() before run()"); + spawn_under(parent, f) +} + +pub fn spawn_under(supervisor: Pid, f: impl FnOnce() + Send + 'static) -> JoinHandle { + let pid = with_runtime(|inner| { + inner.with_shared(|s| { + let (idx, gen) = s.allocate_slot(); + let pid = Pid::new(idx, gen); + let stack = crate::stack::Stack::new(crate::runtime::ACTOR_STACK_SIZE) + .expect("stack allocation failed"); + let sp = init_actor_stack(stack.top(), crate::actor::trampoline); + let slot = &mut s.slots[idx as usize]; + slot.actor = Some(crate::actor::Actor { pid, stack, sp, supervisor }); + slot.state = crate::runtime::State::Runnable; + slot.outstanding_handles = 1; + slot.outcome = None; + slot.waiters.clear(); + slot.supervisor_channel = None; + slot.pending_unpark = false; + slot.pending_io_result = None; + s.run_queue.push_back(pid); + s.pending_closures.push((pid, Box::new(f) as crate::runtime::Closure)); + crate::te!(crate::trace::Event::Spawn { parent: supervisor, child: pid }); + crate::te!(crate::trace::Event::Enqueue(pid)); + pid + }) + }); + + JoinHandle { pid, consumed: false } +} + +use crate::context::init_actor_stack; + +pub fn self_pid() -> Pid { + current_pid().expect("self_pid() called outside an actor") +} + +// --------------------------------------------------------------------------- +// yield_now / park_current / unpark +// --------------------------------------------------------------------------- + +pub fn yield_now() { + runtime::set_yield_intent(YieldIntent::Yield); + unsafe { crate::context::switch_to_scheduler() }; +} + +pub fn park_current() { + runtime::set_yield_intent(YieldIntent::Park); + unsafe { crate::context::switch_to_scheduler() }; +} + +pub fn unpark(pid: Pid) { + let result = try_with_runtime(|inner| { + inner.with_shared(|s| { + if let Some(slot) = s.slot_mut(pid) { + match slot.state { + crate::runtime::State::Parked => { + // Actor is suspended — safe to re-queue immediately. + slot.state = crate::runtime::State::Runnable; + s.run_queue.push_back(pid); + crate::te!(crate::trace::Event::UnparkDirect(pid)); + crate::te!(crate::trace::Event::Enqueue(pid)); + } + crate::runtime::State::Runnable => { + // Actor is still running (between registering its + // parked_receiver and calling park_current). Set the + // flag; the scheduler will re-queue after the Park + // yield instead of sleeping. + slot.pending_unpark = true; + crate::te!(crate::trace::Event::UnparkDeferred(pid)); + } + crate::runtime::State::Done => {} + } + } + }) + }); + let _ = result; +} + +// --------------------------------------------------------------------------- +// NoPreempt +// --------------------------------------------------------------------------- + +pub struct NoPreempt(bool); + +impl NoPreempt { + pub fn enter() -> Self { + let prev = crate::preempt::PREEMPTION_ENABLED.with(|c| c.replace(false)); + NoPreempt(prev) + } +} + +impl Drop for NoPreempt { + fn drop(&mut self) { + crate::preempt::PREEMPTION_ENABLED.with(|c| c.set(self.0)); + } +} + +// --------------------------------------------------------------------------- +// sleep / insert_wait_timer +// --------------------------------------------------------------------------- + +pub fn sleep(duration: std::time::Duration) { + let me = current_pid().expect("sleep() called outside an actor"); + let _np = NoPreempt::enter(); + let deadline = crate::timer::deadline_from_now(duration); + with_runtime(|inner| inner.with_shared(|s| s.timers.insert_sleep(deadline, me))); + park_current(); +} + +pub fn insert_wait_timer( + deadline: std::time::Instant, + pid: Pid, + target: std::sync::Arc, + wait_seq: u64, +) { + with_runtime(|inner| { + inner.with_shared(|s| { + s.timers.insert( + deadline, + pid, + crate::timer::Reason::WaitTimeout { target, wait_seq }, + ); + }) + }); +} + +// --------------------------------------------------------------------------- +// block_on_io / wait_readable / wait_writable / read / write +// --------------------------------------------------------------------------- + +pub fn block_on_io(f: F) -> T +where + F: FnOnce() -> T + Send + 'static, + T: Send + 'static, +{ + let me = current_pid().expect("block_on_io() called outside an actor"); + let work: Box crate::io::IoResult + Send> = Box::new(move || { + let v: T = f(); + Ok(Box::new(v) as Box) + }); + { + let _np = NoPreempt::enter(); + with_runtime(|inner| inner.with_shared(|s| { + let io = s.io.as_mut().expect("io thread not started"); + io.submit(me, work); + })); + park_current(); + } + let result = with_runtime(|inner| inner.with_shared(|s| { + s.slot_mut(me) + .expect("block_on_io: own slot vanished") + .pending_io_result + .take() + .expect("block_on_io: resumed without a result") + })); + match result { + Ok(any) => *any.downcast::().expect("block_on_io: type mismatch"), + Err(payload) => std::panic::resume_unwind(payload), + } +} + +pub fn wait_readable(fd: std::os::fd::RawFd) -> std::io::Result<()> { + wait_fd(fd, true, false) +} + +pub fn wait_writable(fd: std::os::fd::RawFd) -> std::io::Result<()> { + wait_fd(fd, false, true) +} + +fn wait_fd(fd: std::os::fd::RawFd, readable: bool, writable: bool) -> std::io::Result<()> { + let me = current_pid().expect("wait_*() called outside an actor"); + let _np = NoPreempt::enter(); + with_runtime(|inner| inner.with_shared(|s| { + let io = s.io.as_mut().expect("io thread not started"); + io.epoll_register(fd, me, readable, writable) + }))?; + park_current(); + Ok(()) +} + +pub fn read(fd: std::os::fd::RawFd, buf: &mut [u8]) -> std::io::Result { + wait_readable(fd)?; + let n = unsafe { libc::read(fd, buf.as_mut_ptr() as *mut _, buf.len()) }; + if n < 0 { Err(std::io::Error::last_os_error()) } else { Ok(n as usize) } +} + +pub fn write(fd: std::os::fd::RawFd, buf: &[u8]) -> std::io::Result { + wait_writable(fd)?; + let n = unsafe { libc::write(fd, buf.as_ptr() as *const _, buf.len()) }; + if n < 0 { Err(std::io::Error::last_os_error()) } else { Ok(n as usize) } +} + +// --------------------------------------------------------------------------- +// register_supervisor_channel +// --------------------------------------------------------------------------- + +pub fn register_supervisor_channel(pid: Pid, sender: Sender) { + with_runtime(|inner| inner.with_shared(|s| { + if let Some(slot) = s.slot_mut(pid) { + slot.supervisor_channel = Some(sender); + } else { + panic!("register_supervisor_channel: pid {:?} not found", pid); + } + })); +} + +// --------------------------------------------------------------------------- +// Legacy run() — convenience wrapper +// --------------------------------------------------------------------------- + +/// Single-threaded runtime entry point (backwards-compatible wrapper). +/// Equivalent to `runtime::init(Config::exact(1)).run(f)`. +pub fn run(f: F) { + crate::runtime::init(crate::runtime::Config::exact(1)).run(f); +} + + + diff --git a/src/stack.rs b/src/stack.rs new file mode 100644 index 0000000..b742531 --- /dev/null +++ b/src/stack.rs @@ -0,0 +1,89 @@ +//! mmap-based growable stack with a guard page below. +//! +//! Layout (low → high address): +//! [ guard page (PROT_NONE) | stack region ] +//! ^ top() — initial stack pointer +//! +//! Stacks grow downward. Overflow lands in the guard page → SIGSEGV. + +use std::io; + +pub struct Stack { + /// Bottom of the entire mmap'd region (start of guard page). + base: *mut u8, + /// Total mmap'd size: guard_size + stack_size. + total_size: usize, + /// Usable stack size (excluding guard page). + stack_size: usize, +} + +// Stack owns its memory; safe to send across threads. +unsafe impl Send for Stack {} + +impl Stack { + /// Allocate a new stack. `stack_size` is the usable region; one page is + /// added below as a guard page. Both are rounded up to the page size. + pub fn new(stack_size: usize) -> io::Result { + let page = page_size(); + let stack_size = round_up(stack_size, page); + let guard_size = page; + let total_size = guard_size + stack_size; + + let base = unsafe { + libc::mmap( + std::ptr::null_mut(), + total_size, + libc::PROT_READ | libc::PROT_WRITE, + libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, + -1, + 0, + ) + }; + if base == libc::MAP_FAILED { + return Err(io::Error::last_os_error()); + } + let base = base as *mut u8; + + let ret = unsafe { + libc::mprotect(base as *mut libc::c_void, guard_size, libc::PROT_NONE) + }; + if ret != 0 { + let err = io::Error::last_os_error(); + unsafe { libc::munmap(base as *mut libc::c_void, total_size) }; + return Err(err); + } + + Ok(Self { base, total_size, stack_size }) + } + + /// 16-byte-aligned top of the usable region. + pub fn top(&self) -> *mut u8 { + let raw_top = self.base as usize + self.total_size; + (raw_top & !15) as *mut u8 + } + + /// Pointer to the bottom of the usable region (just above the guard page). + pub fn usable_base(&self) -> *mut u8 { + unsafe { self.base.add(page_size()) } + } + + pub fn stack_size(&self) -> usize { + self.stack_size + } +} + +impl Drop for Stack { + fn drop(&mut self) { + unsafe { + libc::munmap(self.base as *mut libc::c_void, self.total_size); + } + } +} + +fn page_size() -> usize { + unsafe { libc::sysconf(libc::_SC_PAGESIZE) as usize } +} + +fn round_up(n: usize, align: usize) -> usize { + (n + align - 1) & !(align - 1) +} diff --git a/src/supervisor.rs b/src/supervisor.rs new file mode 100644 index 0000000..2ecc35d --- /dev/null +++ b/src/supervisor.rs @@ -0,0 +1,37 @@ +//! Supervision signals. +//! +//! Every actor has a supervisor, which is itself just an actor with a +//! `Receiver`. When a child actor terminates, the scheduler sends +//! a `Signal` on the supervisor's channel. The supervisor decides what to +//! do — restart, escalate, ignore. +//! +//! For v0.1 there is no built-in restart-intensity cap. That's policy and +//! lives in user code; library is mechanism only. + +use crate::pid::Pid; +use std::any::Any; + +pub enum Signal { + /// The child exited normally. + Exit(Pid), + /// The child panicked. Payload is whatever `panic!` was called with. + Panic(Pid, Box), +} + +impl std::fmt::Debug for Signal { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Signal::Exit(pid) => write!(f, "Signal::Exit({:?})", pid), + Signal::Panic(pid, _) => write!(f, "Signal::Panic({:?}, ..)", pid), + } + } +} + +impl Signal { + pub fn pid(&self) -> Pid { + match self { + Signal::Exit(p) => *p, + Signal::Panic(p, _) => *p, + } + } +} diff --git a/src/timer.rs b/src/timer.rs new file mode 100644 index 0000000..c3b1549 --- /dev/null +++ b/src/timer.rs @@ -0,0 +1,147 @@ +//! Sleep + wait-with-timeout timers. +//! +//! A min-heap of `(deadline, seq, reason)` entries lives on `SchedulerState`. +//! When an actor sleeps or starts a bounded wait (e.g. `mutex.lock()` with a +//! timeout), the runtime inserts an entry, marks the actor parked, and yields. +//! On every scheduler loop iteration the runtime pops all entries whose +//! deadline has passed and dispatches each according to its `Reason`: +//! +//! - `Sleep`: unpark the actor. +//! - `WaitTimeout`: call `on_timeout` on the registered target. The target +//! (e.g. a `Mutex`) decides whether the actor was actually still waiting +//! (timer fires first → unpark with error) or had already been granted +//! what it was waiting for (lock granted first → no-op). +//! +//! `BinaryHeap` is a max-heap; entries are wrapped in `Reverse` to get +//! min-heap behaviour. +//! +//! No cancellation. When a non-timer wakeup happens (e.g. lock granted +//! before timeout), the timer entry is left in the heap. It will be popped +//! eventually and the dispatch will observe "actor is no longer parked / +//! wait_seq is stale" and no-op. Cost is ~32 bytes per stale entry plus a +//! few cycles on pop; acceptable given the upper bound is "one entry per +//! parked actor". +//! +//! Stale pids (slot reused since the timer was inserted) are filtered on +//! pop by the scheduler — same convention as the run queue. + +use crate::pid::Pid; +use std::cmp::Reverse; +use std::collections::BinaryHeap; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +/// What to do when a timer entry's deadline arrives. +/// +/// Held inside `Entry`, dispatched by the scheduler in `pop_due`. +pub enum Reason { + /// `loom::sleep(d)`. Unpark `pid` unconditionally (modulo the usual + /// "still parked?" check the scheduler applies). + Sleep, + /// A bounded wait — currently only `Mutex::lock_timeout`. On expiry the + /// scheduler calls `target.on_timeout(pid, wait_seq)`. The target then + /// decides whether `pid` was actually still waiting, and if so unparks + /// it with whatever error the wait was bounded for. `wait_seq` lets the + /// target tell apart "this wait" from "a later wait by the same actor + /// on the same target". + WaitTimeout { + target: Arc, + wait_seq: u64, + }, +} + +/// Callback the scheduler invokes when a `WaitTimeout` entry pops. +/// +/// Implementors: do not touch `SchedulerState` other than via the public +/// `unpark` / channel APIs. The scheduler is mid-iteration when this fires. +pub trait TimerTarget: Send + Sync { + fn on_timeout(&self, pid: Pid, wait_seq: u64); +} + +pub struct Entry { + pub deadline: Instant, + /// Insertion order, used purely as a tiebreaker so `Entry: Ord` works + /// without having to compare the `Reason` payload (which contains an + /// `Rc` and isn't `Ord`). + seq: u64, + pub pid: Pid, + pub reason: Reason, +} + +impl PartialEq for Entry { + fn eq(&self, other: &Self) -> bool { + self.deadline == other.deadline && self.seq == other.seq + } +} +impl Eq for Entry {} + +impl Ord for Entry { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + // Earlier deadline first; ties broken by insertion order so the + // ordering is total. `Reason` and `Pid` deliberately don't + // participate. + self.deadline.cmp(&other.deadline).then_with(|| self.seq.cmp(&other.seq)) + } +} + +impl PartialOrd for Entry { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +#[derive(Default)] +pub struct Timers { + /// Reverse-wrapped so the smallest deadline is at the top. + heap: BinaryHeap>, + /// Monotonic counter for the tiebreaker `seq` field. + next_seq: u64, +} + +impl Timers { + pub fn new() -> Self { + Self { heap: BinaryHeap::new(), next_seq: 0 } + } + + /// Insert a `Sleep` timer. Convenience for the common case. + pub fn insert_sleep(&mut self, deadline: Instant, pid: Pid) { + self.insert(deadline, pid, Reason::Sleep); + } + + /// Insert an arbitrary timer entry. + pub fn insert(&mut self, deadline: Instant, pid: Pid, reason: Reason) { + let seq = self.next_seq; + self.next_seq = self.next_seq.wrapping_add(1); + self.heap.push(Reverse(Entry { deadline, seq, pid, reason })); + } + + pub fn is_empty(&self) -> bool { + self.heap.is_empty() + } + + /// Soonest pending deadline, or `None` if the heap is empty. + pub fn peek_deadline(&self) -> Option { + self.heap.peek().map(|r| r.0.deadline) + } + + /// Pop every entry whose deadline is ≤ `now`, in deadline order. + /// The scheduler dispatches each entry by inspecting `entry.reason`. + pub fn pop_due(&mut self, now: Instant) -> Vec { + let mut out = Vec::new(); + while let Some(r) = self.heap.peek() { + if r.0.deadline <= now { + out.push(self.heap.pop().unwrap().0); + } else { + break; + } + } + out + } +} + +/// Wall-clock duration helper exposed for `sleep` and `lock_timeout`. +pub fn deadline_from_now(duration: Duration) -> Instant { + Instant::now() + .checked_add(duration) + .unwrap_or_else(Instant::now) +} diff --git a/src/trace.rs b/src/trace.rs new file mode 100644 index 0000000..592b040 --- /dev/null +++ b/src/trace.rs @@ -0,0 +1,246 @@ +//! Structured per-event tracing for smarm. +//! +//! Enabled by `--features smarm-trace`. Zero cost without the feature. +//! +//! Architecture: MPSC. Every scheduler thread holds a thread-local Sender +//! clone (one mutex acquire per thread, on first use). A dedicated drain +//! thread owns the Receiver, batches records, and writes to a BufWriter. +//! The hot path (record()) is a single channel send — no mutex, no disk I/O. +//! +//! Usage: +//! cargo test --test runtime --features smarm-trace +//! +//! Output: smarm_trace.json in cwd, or $SMARM_TRACE_FILE. +//! View: https://ui.perfetto.dev or chrome://tracing + +#[cfg(feature = "smarm-trace")] +#[macro_export] +macro_rules! te { + ($kind:expr) => { $crate::trace::record($kind) }; +} + +#[cfg(not(feature = "smarm-trace"))] +#[macro_export] +macro_rules! te { + ($kind:expr) => { () }; +} + +#[cfg(feature = "smarm-trace")] +pub use inner::*; + +#[cfg(feature = "smarm-trace")] +mod inner { + use crate::pid::Pid; + use std::io::Write; + use std::sync::{mpsc, Mutex}; + use std::time::Instant; + + // ----------------------------------------------------------------------- + // Event kinds + // ----------------------------------------------------------------------- + + #[derive(Clone, Debug)] + pub enum Event { + // Actor lifecycle + Spawn { parent: Pid, child: Pid }, + Resume(Pid), + Yield(Pid), + Park(Pid), + Done(Pid), + // Wakeup paths + UnparkDirect(Pid), // unpark() saw Parked -> re-queued immediately + UnparkDeferred(Pid), // unpark() saw Runnable -> set pending_unpark flag + UnparkFlagConsumed(Pid), // scheduler saw flag on Park -> re-queued instead + // Channel + Send { sender: Pid, receiver: Option }, + RecvPark(Pid), + RecvWake(Pid), + // Queue + Enqueue(Pid), + Dequeue(Pid), + } + + // ----------------------------------------------------------------------- + // Wire format sent through the channel + // ----------------------------------------------------------------------- + + struct Record { + nanos: u64, // ns since open() + tid: u64, // OS thread id + event: Event, + } + + // Sentinel: drain thread flushes and exits when it receives this. + enum Msg { + Event(Record), + Flush, + } + + // ----------------------------------------------------------------------- + // Global sender + start time + // ----------------------------------------------------------------------- + + struct Global { + sender: mpsc::Sender, + start: Instant, + } + + static GLOBAL: Mutex> = Mutex::new(None); + + // Per-thread state: cached Sender clone + cached copy of start Instant. + // The Sender clone is taken once per thread (one mutex hit). + // The start Instant is copied alongside it — also one mutex hit per thread. + // record() never touches GLOBAL after that. + struct LocalState { + tx: mpsc::Sender, + start: Instant, + } + + thread_local! { + static LOCAL_STATE: std::cell::RefCell> = + std::cell::RefCell::new(None); + } + + // ----------------------------------------------------------------------- + // Lifecycle + // ----------------------------------------------------------------------- + + pub fn open() { + let path = std::env::var("SMARM_TRACE_FILE") + .unwrap_or_else(|_| "smarm_trace.json".to_owned()); + + let (tx, rx) = mpsc::channel::(); + let start = Instant::now(); + + *GLOBAL.lock().unwrap() = Some(Global { sender: tx, start }); + + // Drain thread: owns the Receiver, writes to disk. + let path_for_thread = path.clone(); + std::thread::Builder::new() + .name("smarm-trace-drain".into()) + .spawn(move || drain_thread(rx, &path_for_thread)) + .expect("failed to spawn trace drain thread"); + + eprintln!("[smarm-trace] writing to {}", path); + } + + /// Send a Flush sentinel and block until the drain thread finishes writing. + /// Called by Runtime::run after all scheduler threads have exited. + pub fn flush() { + // Drop the global sender so the drain thread's recv() returns Err + // after the Flush sentinel, signalling clean shutdown. + let sender = { + let mut g = GLOBAL.lock().unwrap(); + g.take().map(|g| g.sender) + }; + if let Some(tx) = sender { + let _ = tx.send(Msg::Flush); + // tx drops here — drain thread will see disconnected after Flush. + } + // Clear thread-local state. + LOCAL_STATE.with(|c| *c.borrow_mut() = None); + } + + // ----------------------------------------------------------------------- + // Hot path + // ----------------------------------------------------------------------- + + pub fn record(event: Event) { + // Disable preemption for the entire duration of record(). Any + // allocation here (mutex internals, channel send, lazy init) would + // trigger PreemptingAllocator -> maybe_preempt -> switch_to_scheduler, + // which would try to re-acquire inner.shared (already held at many + // te!() call sites) -> deadlock. Guard at the very top, before any + // allocation-capable call. + let was_enabled = crate::preempt::PREEMPTION_ENABLED + .with(|e| { let v = e.get(); e.set(false); v }); + + LOCAL_STATE.with(|cell| { + let mut opt = cell.borrow_mut(); + // Lazily initialise: one mutex hit per thread, ever. + if opt.is_none() { + if let Some(g) = GLOBAL.lock().unwrap().as_ref() { + let tx = g.sender.clone(); + *opt = Some(LocalState { tx, start: g.start }); + } + } + if let Some(ls) = opt.as_ref() { + let nanos = ls.start.elapsed().as_nanos() as u64; + let tid = os_tid(); + let _ = ls.tx.send(Msg::Event(Record { nanos, tid, event })); + } + }); + + crate::preempt::PREEMPTION_ENABLED.with(|e| e.set(was_enabled)); + } + + // ----------------------------------------------------------------------- + // Drain thread + // ----------------------------------------------------------------------- + + fn drain_thread(rx: mpsc::Receiver, path: &str) { + let f = match std::fs::File::create(path) { + Ok(f) => f, + Err(e) => { eprintln!("[smarm-trace] create failed: {}", e); return; } + }; + let mut w = std::io::BufWriter::new(f); + let _ = writeln!(w, "{{\"traceEvents\":["); + + let mut count: u64 = 0; + let mut first = true; + + loop { + match rx.recv() { + Ok(Msg::Event(r)) => { + let (name, actor_idx) = chrome_fields(&r.event); + let ts_us = r.nanos as f64 / 1000.0; + if !first { let _ = w.write_all(b",\n"); } + first = false; + let _ = write!(w, + "{{\"ph\":\"i\",\"ts\":{:.3},\"pid\":{},\"tid\":{},\"name\":{:?},\"s\":\"g\"}}", + ts_us, actor_idx, r.tid, name); + count += 1; + } + Ok(Msg::Flush) | Err(_) => { + // Clean close. + let _ = writeln!(w, "\n]}}"); + let _ = w.flush(); + eprintln!("[smarm-trace] {} events written", count); + return; + } + } + } + } + + // ----------------------------------------------------------------------- + // Chrome trace helpers + // ----------------------------------------------------------------------- + + fn chrome_fields(ev: &Event) -> (String, u32) { + match ev { + Event::Spawn { parent, child } => + (format!("spawn c={}", child.index()), parent.index()), + Event::Resume(p) => ("resume".into(), p.index()), + Event::Yield(p) => ("yield".into(), p.index()), + Event::Park(p) => ("park".into(), p.index()), + Event::Done(p) => ("done".into(), p.index()), + Event::UnparkDirect(p) => ("unpark_direct".into(), p.index()), + Event::UnparkDeferred(p) => ("unpark_deferred".into(), p.index()), + Event::UnparkFlagConsumed(p) => ("unpark_flag_consumed".into(), p.index()), + Event::Send { sender, receiver } => ( + format!("send rx={}", receiver + .map(|p| p.index().to_string()) + .unwrap_or_else(|| "none".into())), + sender.index(), + ), + Event::RecvPark(p) => ("recv_park".into(), p.index()), + Event::RecvWake(p) => ("recv_wake".into(), p.index()), + Event::Enqueue(p) => ("enqueue".into(), p.index()), + Event::Dequeue(p) => ("dequeue".into(), p.index()), + } + } + + fn os_tid() -> u64 { + unsafe { libc::syscall(libc::SYS_gettid) as u64 } + } +} diff --git a/tests/channel.rs b/tests/channel.rs new file mode 100644 index 0000000..ed87c8d --- /dev/null +++ b/tests/channel.rs @@ -0,0 +1,110 @@ +//! Channel tests. These run under the scheduler because `recv()` needs to +//! be able to park, which requires a live runtime. + +use smarm::{channel, run, spawn}; +use std::cell::Cell; + +thread_local! { + static OUT: Cell = const { Cell::new(0) }; +} + +#[test] +fn send_then_recv_same_actor() { + OUT.with(|c| c.set(0)); + run(|| { + let (tx, rx) = channel::(); + tx.send(42).unwrap(); + let v = rx.recv().unwrap(); + OUT.with(|c| c.set(v)); + }); + assert_eq!(OUT.with(|c| c.get()), 42); +} + +#[test] +fn recv_parks_until_send_from_other_actor() { + OUT.with(|c| c.set(0)); + run(|| { + let (tx, rx) = channel::(); + let h = spawn(move || { + // This actor blocks on an empty channel. + let v = rx.recv().unwrap(); + OUT.with(|c| c.set(v)); + }); + // Parent runs, then yields to let the child block, + // then sends, then joins. + smarm::yield_now(); + tx.send(7).unwrap(); + h.join().unwrap(); + }); + assert_eq!(OUT.with(|c| c.get()), 7); +} + +#[test] +fn multiple_messages_arrive_in_order() { + let captured: std::sync::Arc>> = + std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let cap2 = captured.clone(); + + run(move || { + let (tx, rx) = channel::(); + let h = spawn(move || { + for _ in 0..3 { + let v = rx.recv().unwrap(); + cap2.lock().unwrap().push(v); + } + }); + for v in 1..=3i64 { + tx.send(v).unwrap(); + } + h.join().unwrap(); + }); + + assert_eq!(*captured.lock().unwrap(), vec![1, 2, 3]); +} + +#[test] +fn cloned_senders_both_deliver() { + let captured: std::sync::Arc>> = + std::sync::Arc::new(std::sync::Mutex::new(Vec::new())); + let cap2 = captured.clone(); + + run(move || { + let (tx, rx) = channel::(); + let tx2 = tx.clone(); + let h = spawn(move || { + for _ in 0..2 { + let v = rx.recv().unwrap(); + cap2.lock().unwrap().push(v); + } + }); + tx.send(10).unwrap(); + tx2.send(20).unwrap(); + h.join().unwrap(); + }); + + let mut got = captured.lock().unwrap().clone(); + got.sort(); + assert_eq!(got, vec![10, 20]); +} + +#[test] +fn recv_returns_err_when_all_senders_dropped() { + let saw_err: std::sync::Arc = + std::sync::Arc::new(std::sync::atomic::AtomicBool::new(false)); + let saw_err2 = saw_err.clone(); + + run(move || { + let (tx, rx) = channel::(); + let h = spawn(move || { + // Receiver waits; no message will ever come. + if rx.recv().is_err() { + saw_err2.store(true, std::sync::atomic::Ordering::SeqCst); + } + }); + smarm::yield_now(); + drop(tx); // last sender gone; rx.recv must return Err. + h.join().unwrap(); + }); + + assert!(saw_err.load(std::sync::atomic::Ordering::SeqCst)); +} diff --git a/tests/context.rs b/tests/context.rs new file mode 100644 index 0000000..150bcb1 --- /dev/null +++ b/tests/context.rs @@ -0,0 +1,137 @@ +//! Low-level context-switch tests. These poke `init_actor_stack` and the +//! naked asm shims directly — no scheduler involved. + +use smarm::context::{ + get_actor_sp, init_actor_stack, set_actor_sp, switch_to_actor, switch_to_scheduler, +}; +use smarm::stack::Stack; +use std::cell::Cell; + +thread_local! { + static LOG: Cell = const { Cell::new(0) }; +} + +fn log(v: u64) { LOG.with(|c| c.set(c.get() | v)); } +fn get_log() -> u64 { LOG.with(|c| c.get()) } +fn reset_log() { LOG.with(|c| c.set(0)); } + +extern "C-unwind" fn actor_simple() { + log(0x1); + unsafe { switch_to_scheduler() }; +} + +#[test] +fn actor_runs_and_returns_to_scheduler() { + reset_log(); + let stack = Stack::new(64 * 1024).unwrap(); + let sp = init_actor_stack(stack.top(), actor_simple); + set_actor_sp(sp); + unsafe { switch_to_actor() }; + assert_eq!(get_log(), 0x1); +} + +extern "C-unwind" fn actor_two_steps() { + log(0x1); + unsafe { switch_to_scheduler() }; + log(0x2); + unsafe { switch_to_scheduler() }; +} + +#[test] +fn actor_yields_and_resumes() { + reset_log(); + let stack = Stack::new(64 * 1024).unwrap(); + let sp = init_actor_stack(stack.top(), actor_two_steps); + set_actor_sp(sp); + + unsafe { switch_to_actor() }; + assert_eq!(get_log(), 0x1, "after first resume"); + + unsafe { switch_to_actor() }; + assert_eq!(get_log(), 0x1 | 0x2, "after second resume"); +} + +// Callee-saved registers must survive a yield. + +use std::sync::OnceLock; + +static REG_BEFORE: OnceLock<[u64; 4]> = OnceLock::new(); +static REG_AFTER: OnceLock<[u64; 4]> = OnceLock::new(); + +extern "C-unwind" fn actor_reg_check() { + unsafe { + let s0: u64 = 0xAAAA_BBBB_0000_0001; + let s1: u64 = 0xCCCC_DDDD_0000_0002; + let s2: u64 = 0xEEEE_FFFF_0000_0003; + let s3: u64 = 0x1111_2222_0000_0004; + + core::arch::asm!( + "mov r12, {s0}", "mov r13, {s1}", "mov r14, {s2}", "mov r15, {s3}", + s0 = in(reg) s0, s1 = in(reg) s1, s2 = in(reg) s2, s3 = in(reg) s3, + out("r12") _, out("r13") _, out("r14") _, out("r15") _, + ); + REG_BEFORE.set([s0, s1, s2, s3]).ok(); + switch_to_scheduler(); + + let a0: u64; let a1: u64; let a2: u64; let a3: u64; + core::arch::asm!( + "mov {a0}, r12", "mov {a1}, r13", "mov {a2}, r14", "mov {a3}, r15", + a0 = out(reg) a0, a1 = out(reg) a1, a2 = out(reg) a2, a3 = out(reg) a3, + ); + REG_AFTER.set([a0, a1, a2, a3]).ok(); + switch_to_scheduler(); + } +} + +#[test] +fn callee_saved_registers_survive_yield() { + let stack = Stack::new(64 * 1024).unwrap(); + let sp = init_actor_stack(stack.top(), actor_reg_check); + set_actor_sp(sp); + unsafe { switch_to_actor(); switch_to_actor(); } + assert_eq!(REG_BEFORE.get().copied().unwrap(), REG_AFTER.get().copied().unwrap()); +} + +// Two actors, independent stacks. + +thread_local! { + static A_VAL: Cell = const { Cell::new(0) }; + static B_VAL: Cell = const { Cell::new(0) }; +} + +extern "C-unwind" fn actor_a() { + A_VAL.with(|c| c.set(0xAAAA)); + unsafe { switch_to_scheduler() }; + let v = A_VAL.with(|c| c.get()); + A_VAL.with(|c| c.set(if v == 0xAAAA { 0xA00D } else { 0xDEAD })); + unsafe { switch_to_scheduler() }; +} + +extern "C-unwind" fn actor_b() { + B_VAL.with(|c| c.set(0xBBBB)); + unsafe { switch_to_scheduler() }; + let v = B_VAL.with(|c| c.get()); + B_VAL.with(|c| c.set(if v == 0xBBBB { 0xB00D } else { 0xDEAD })); + unsafe { switch_to_scheduler() }; +} + +#[test] +fn two_actors_dont_corrupt_each_other() { + let stack_a = Stack::new(64 * 1024).unwrap(); + let stack_b = Stack::new(64 * 1024).unwrap(); + + let sp_a = init_actor_stack(stack_a.top(), actor_a); + let sp_b = init_actor_stack(stack_b.top(), actor_b); + + set_actor_sp(sp_a); unsafe { switch_to_actor() }; + let sp_a = get_actor_sp(); + + set_actor_sp(sp_b); unsafe { switch_to_actor() }; + let sp_b = get_actor_sp(); + + set_actor_sp(sp_a); unsafe { switch_to_actor() }; + set_actor_sp(sp_b); unsafe { switch_to_actor() }; + + assert_eq!(A_VAL.with(|c| c.get()), 0xA00D); + assert_eq!(B_VAL.with(|c| c.get()), 0xB00D); +} diff --git a/tests/io.rs b/tests/io.rs new file mode 100644 index 0000000..820ee66 --- /dev/null +++ b/tests/io.rs @@ -0,0 +1,99 @@ +//! Tests for `block_on_io` — running a blocking closure on a worker OS +//! thread while the calling actor is parked. + +use smarm::{block_on_io, run, spawn, yield_now}; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::{Arc, Mutex}; +use std::time::Duration; + +#[test] +fn block_on_io_returns_the_closures_value() { + let captured: Arc>> = Arc::new(Mutex::new(None)); + let c = captured.clone(); + run(move || { + let v: u64 = block_on_io(|| { + // Burn a tiny bit of time so this actually crosses thread. + std::thread::sleep(Duration::from_millis(5)); + 42 + }); + *c.lock().unwrap() = Some(v); + }); + assert_eq!(*captured.lock().unwrap(), Some(42)); +} + +#[test] +fn other_actors_run_while_block_on_io_is_in_flight() { + // While actor A is parked in block_on_io, actor B should be able to + // make progress. + let order: Arc>> = Arc::new(Mutex::new(Vec::new())); + let oa = order.clone(); + let ob = order.clone(); + + run(move || { + let a = spawn(move || { + oa.lock().unwrap().push(1); // A starts first. + block_on_io(|| { + std::thread::sleep(Duration::from_millis(50)); + }); + oa.lock().unwrap().push(4); // A resumes last. + }); + let b = spawn(move || { + // Make sure A enters block_on_io first. + yield_now(); + ob.lock().unwrap().push(2); + yield_now(); + ob.lock().unwrap().push(3); + }); + a.join().unwrap(); + b.join().unwrap(); + }); + + // Required interleaving: 1 (A starts) before 2,3 (B runs while A + // is parked), and 4 (A resumes) after 2,3. + let v = order.lock().unwrap(); + assert_eq!(v[0], 1, "log: {:?}", *v); + assert_eq!(v[v.len() - 1], 4, "log: {:?}", *v); + let pos_2 = v.iter().position(|&x| x == 2).unwrap(); + let pos_3 = v.iter().position(|&x| x == 3).unwrap(); + let pos_4 = v.iter().position(|&x| x == 4).unwrap(); + assert!(pos_2 < pos_4, "B's first step ran after A resumed: {:?}", *v); + assert!(pos_3 < pos_4, "B's second step ran after A resumed: {:?}", *v); +} + +#[test] +fn many_concurrent_block_on_io_calls_all_complete() { + let counter = Arc::new(AtomicU32::new(0)); + let c = counter.clone(); + run(move || { + let mut handles = Vec::new(); + for _ in 0..10 { + let cc = c.clone(); + handles.push(spawn(move || { + let n: u32 = block_on_io(|| { + std::thread::sleep(Duration::from_millis(10)); + 1 + }); + cc.fetch_add(n, Ordering::SeqCst); + })); + } + for h in handles { h.join().unwrap(); } + }); + assert_eq!(counter.load(Ordering::SeqCst), 10); +} + +#[test] +fn block_on_io_panic_propagates_to_caller() { + let saw_err = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let s = saw_err.clone(); + run(move || { + let h = spawn(move || { + // The closure panics on the worker thread; that should + // resurface as a panic in this actor. + let _: () = block_on_io(|| panic!("boom on io thread")); + }); + if h.join().is_err() { + s.store(true, Ordering::SeqCst); + } + }); + assert!(saw_err.load(Ordering::SeqCst)); +} diff --git a/tests/io_epoll.rs b/tests/io_epoll.rs new file mode 100644 index 0000000..a28516a --- /dev/null +++ b/tests/io_epoll.rs @@ -0,0 +1,324 @@ +//! Tests for epoll-based fd readiness primitives: `wait_readable`, +//! `wait_writable`, and the `read`/`write` sugar on top of them. +//! +//! Pipes are the convenient test target: cheap to create, easy to drive, +//! and we already use `libc::pipe2` internally. Each pipe is one direction +//! and respects `O_NONBLOCK` if we ask for it. + +use smarm::{run, spawn, wait_readable, wait_writable, yield_now}; +use std::os::fd::RawFd; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::sync::Arc; +use std::sync::Mutex as StdMutex; +use std::time::Duration; + +// --------------------------------------------------------------------------- +// Pipe helper +// --------------------------------------------------------------------------- + +struct Pipe { + read: RawFd, + write: RawFd, +} + +impl Pipe { + fn new() -> Self { + let mut fds: [libc::c_int; 2] = [0; 2]; + let r = unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC | libc::O_NONBLOCK) }; + assert_eq!(r, 0, "pipe2 failed"); + Pipe { + read: fds[0], + write: fds[1], + } + } +} + +impl Drop for Pipe { + fn drop(&mut self) { + unsafe { + libc::close(self.read); + libc::close(self.write); + } + } +} + +fn raw_write(fd: RawFd, buf: &[u8]) -> isize { + unsafe { libc::write(fd, buf.as_ptr() as *const _, buf.len()) } +} + +fn raw_read(fd: RawFd, buf: &mut [u8]) -> isize { + unsafe { libc::read(fd, buf.as_mut_ptr() as *mut _, buf.len()) } +} + +// --------------------------------------------------------------------------- +// wait_readable parks until data arrives, then libc::read succeeds. +// --------------------------------------------------------------------------- + +#[test] +fn wait_readable_blocks_until_data_arrives_then_read_succeeds() { + let captured: Arc>> = Arc::new(StdMutex::new(Vec::new())); + let cap = captured.clone(); + + let p = Arc::new(Pipe::new()); + let p_reader = p.clone(); + let p_writer = p.clone(); + + run(move || { + let reader = spawn(move || { + // Initially the pipe is empty; this parks. + wait_readable(p_reader.read).expect("wait_readable failed"); + // Now data should be readable. + let mut buf = [0u8; 16]; + let n = raw_read(p_reader.read, &mut buf); + assert!(n > 0, "read returned {}", n); + cap.lock().unwrap().extend_from_slice(&buf[..n as usize]); + }); + + let writer = spawn(move || { + // Yield so the reader gets to park first. + yield_now(); + yield_now(); + // Sleep a touch so the reader is definitely waiting in epoll. + smarm::sleep(Duration::from_millis(5)); + let n = raw_write(p_writer.write, b"hello"); + assert_eq!(n, 5); + }); + + reader.join().unwrap(); + writer.join().unwrap(); + }); + + assert_eq!(*captured.lock().unwrap(), b"hello"); +} + +// --------------------------------------------------------------------------- +// The smarm::scheduler::read sugar — wait_readable + libc::read in one call. +// --------------------------------------------------------------------------- + +#[test] +fn read_sugar_returns_bytes_from_pipe() { + let captured: Arc>> = Arc::new(StdMutex::new(Vec::new())); + let cap = captured.clone(); + + let p = Arc::new(Pipe::new()); + let p_reader = p.clone(); + let p_writer = p.clone(); + + run(move || { + let reader = spawn(move || { + let mut buf = [0u8; 16]; + let n = smarm::scheduler::read(p_reader.read, &mut buf) + .expect("smarm::scheduler::read failed"); + cap.lock().unwrap().extend_from_slice(&buf[..n]); + }); + + let writer = spawn(move || { + yield_now(); + smarm::sleep(Duration::from_millis(5)); + let _ = raw_write(p_writer.write, b"world"); + }); + + reader.join().unwrap(); + writer.join().unwrap(); + }); + + assert_eq!(*captured.lock().unwrap(), b"world"); +} + +// --------------------------------------------------------------------------- +// wait_writable + write — though pipes are almost always writable; the +// useful test here is that the call doesn't hang on a writable fd. +// --------------------------------------------------------------------------- + +#[test] +fn write_sugar_sends_bytes_to_pipe() { + let counter = Arc::new(AtomicU32::new(0)); + let c = counter.clone(); + + let p = Arc::new(Pipe::new()); + let p_writer = p.clone(); + let p_reader = p.clone(); + + run(move || { + let writer = spawn(move || { + // Pipe is empty + has buffer space, so this returns immediately + // after wait_writable wakes (which happens fast because the + // kernel marks an empty pipe as immediately writable). + let n = smarm::scheduler::write(p_writer.write, b"smarm") + .expect("write failed"); + assert_eq!(n, 5); + c.fetch_add(1, Ordering::SeqCst); + }); + + let reader = spawn(move || { + // Give the writer time. + smarm::sleep(Duration::from_millis(10)); + let mut buf = [0u8; 16]; + let n = raw_read(p_reader.read, &mut buf); + assert_eq!(n, 5); + assert_eq!(&buf[..5], b"smarm"); + }); + + writer.join().unwrap(); + reader.join().unwrap(); + }); + + assert_eq!(counter.load(Ordering::SeqCst), 1); +} + +// --------------------------------------------------------------------------- +// While an actor is parked on wait_readable, other actors keep running. +// --------------------------------------------------------------------------- + +#[test] +fn other_actors_run_while_one_is_parked_on_wait_readable() { + let log: Arc>> = Arc::new(StdMutex::new(Vec::new())); + let la = log.clone(); + let lb = log.clone(); + + let p = Arc::new(Pipe::new()); + let p_a = p.clone(); + let p_b = p.clone(); + + run(move || { + let a = spawn(move || { + la.lock().unwrap().push(b'A'); + wait_readable(p_a.read).unwrap(); + la.lock().unwrap().push(b'a'); + }); + + let b = spawn(move || { + // A starts parking on the empty pipe; B should be free to do + // its work in the meantime. + for _ in 0..3 { + yield_now(); + lb.lock().unwrap().push(b'B'); + } + // Now wake A. + let _ = raw_write(p_b.write, b"x"); + }); + + a.join().unwrap(); + b.join().unwrap(); + }); + + let v = log.lock().unwrap(); + // A goes first ('A'), then B makes progress (multiple 'B's) while A is + // parked, then A wakes and finishes ('a'). + let pos_big_a = v.iter().position(|&c| c == b'A').unwrap(); + let pos_lit_a = v.iter().position(|&c| c == b'a').unwrap(); + let big_b_count = v.iter().filter(|&&c| c == b'B').count(); + assert_eq!(big_b_count, 3, "B should have made 3 steps: {:?}", *v); + assert!(pos_big_a < pos_lit_a, "A pre-park before A post-park: {:?}", *v); + // At least the last B step should be before A resumes. + let last_big_b = v.iter().rposition(|&c| c == b'B').unwrap(); + assert!(last_big_b < pos_lit_a, "B should finish before A resumes: {:?}", *v); +} + +// --------------------------------------------------------------------------- +// Two-way pipe ping-pong via wait_readable. +// --------------------------------------------------------------------------- + +#[test] +fn ping_pong_between_two_pipes_completes() { + // a_to_b: actor A writes, actor B reads. + // b_to_a: actor B writes, actor A reads. + let a_to_b = Arc::new(Pipe::new()); + let b_to_a = Arc::new(Pipe::new()); + + let counter = Arc::new(AtomicU32::new(0)); + let ca = counter.clone(); + let cb = counter.clone(); + + let a_to_b_a = a_to_b.clone(); + let a_to_b_b = a_to_b.clone(); + let b_to_a_a = b_to_a.clone(); + let b_to_a_b = b_to_a.clone(); + + run(move || { + let a = spawn(move || { + for _ in 0..5 { + let _ = raw_write(a_to_b_a.write, b"x"); + wait_readable(b_to_a_a.read).unwrap(); + let mut buf = [0u8; 4]; + let _ = raw_read(b_to_a_a.read, &mut buf); + ca.fetch_add(1, Ordering::SeqCst); + } + }); + + let b = spawn(move || { + for _ in 0..5 { + wait_readable(a_to_b_b.read).unwrap(); + let mut buf = [0u8; 4]; + let _ = raw_read(a_to_b_b.read, &mut buf); + let _ = raw_write(b_to_a_b.write, b"y"); + cb.fetch_add(1, Ordering::SeqCst); + } + }); + + a.join().unwrap(); + b.join().unwrap(); + }); + + // Both sides did 5 rounds; counter is incremented by both, so total = 10. + assert_eq!(counter.load(Ordering::SeqCst), 10); +} + +// --------------------------------------------------------------------------- +// Same fd reused across calls — DEL+ADD cycle works. +// --------------------------------------------------------------------------- + +#[test] +fn same_fd_can_be_waited_on_repeatedly() { + let p = Arc::new(Pipe::new()); + let p_r = p.clone(); + let p_w = p.clone(); + let counter = Arc::new(AtomicU32::new(0)); + let c = counter.clone(); + + run(move || { + let reader = spawn(move || { + for _ in 0..4 { + wait_readable(p_r.read).unwrap(); + let mut buf = [0u8; 4]; + let n = raw_read(p_r.read, &mut buf); + assert!(n > 0); + c.fetch_add(1, Ordering::SeqCst); + } + }); + + let writer = spawn(move || { + for _ in 0..4 { + yield_now(); + smarm::sleep(Duration::from_millis(2)); + let _ = raw_write(p_w.write, b"z"); + } + }); + + reader.join().unwrap(); + writer.join().unwrap(); + }); + + assert_eq!(counter.load(Ordering::SeqCst), 4); +} + +// --------------------------------------------------------------------------- +// Sanity that wait_writable on an already-writable pipe returns promptly. +// --------------------------------------------------------------------------- + +#[test] +fn wait_writable_on_empty_pipe_returns_quickly() { + let p = Arc::new(Pipe::new()); + let p_w = p.clone(); + + let start = std::time::Instant::now(); + run(move || { + wait_writable(p_w.write).unwrap(); + }); + let elapsed = start.elapsed(); + assert!( + elapsed < Duration::from_millis(200), + "wait_writable should be fast on a writable fd, took {:?}", + elapsed + ); +} diff --git a/tests/mutex.rs b/tests/mutex.rs new file mode 100644 index 0000000..6647c22 --- /dev/null +++ b/tests/mutex.rs @@ -0,0 +1,314 @@ +//! `loom::Mutex` tests. All run under the scheduler because `lock()` +//! needs to be able to park. + +use smarm::{run, spawn, yield_now, LockTimeout, Mutex}; +use std::sync::Arc; +use std::sync::Mutex as StdMutex; +use std::sync::atomic::{AtomicU32, Ordering}; +use std::time::{Duration, Instant}; + +// --------------------------------------------------------------------------- +// Uncontended fast path +// --------------------------------------------------------------------------- + +#[test] +fn lock_free_mutex_succeeds() { + let captured = Arc::new(AtomicU32::new(0)); + let c = captured.clone(); + run(move || { + let m = Mutex::new(42u32); + { + let g = m.lock_timeout(Duration::from_millis(500)).unwrap(); + c.store(*g, Ordering::SeqCst); + } + // After drop we can lock again. + let g2 = m.lock_timeout(Duration::from_millis(500)).unwrap(); + assert_eq!(*g2, 42); + }); + assert_eq!(captured.load(Ordering::SeqCst), 42); +} + +#[test] +fn try_lock_returns_some_when_free_none_when_held() { + let success_flag = Arc::new(AtomicU32::new(0)); + let s = success_flag.clone(); + run(move || { + let m = Mutex::new(0u32); + let g = m.try_lock().expect("free"); + // Holding the guard; a second try_lock on the same actor should fail. + assert!(m.try_lock().is_none()); + drop(g); + // Now free again. + let g2 = m.try_lock().expect("free again"); + drop(g2); + s.store(1, Ordering::SeqCst); + }); + assert_eq!(success_flag.load(Ordering::SeqCst), 1); +} + +#[test] +fn guard_mutates_value_visible_through_next_lock() { + let final_value = Arc::new(AtomicU32::new(0)); + let f = final_value.clone(); + run(move || { + let m = Mutex::new(0u32); + { + let mut g = m.lock_timeout(Duration::from_millis(500)).unwrap(); + *g = 7; + } + let g2 = m.lock_timeout(Duration::from_millis(500)).unwrap(); + f.store(*g2, Ordering::SeqCst); + }); + assert_eq!(final_value.load(Ordering::SeqCst), 7); +} + +// --------------------------------------------------------------------------- +// Contention: a second actor parks until the first releases. +// --------------------------------------------------------------------------- + +#[test] +fn contended_lock_parks_until_holder_releases() { + // Actor A locks, yields (still holding), then releases. Actor B tries + // to lock in between — B should park, then succeed after A drops. + let log: Arc>> = Arc::new(StdMutex::new(Vec::new())); + let la = log.clone(); + let lb = log.clone(); + + run(move || { + let m = Mutex::new(0u32); + let m_a = m.clone(); + let m_b = m.clone(); + + let a = spawn(move || { + let g = m_a.lock_timeout(Duration::from_millis(500)).unwrap(); + la.lock().unwrap().push("A_locked"); + // First yield: lets B run past its first yield_now. + yield_now(); + // Second yield: lets B reach B_try and attempt lock() while we + // still hold it, so B parks on the mutex. + yield_now(); + la.lock().unwrap().push("A_dropping"); + drop(g); + la.lock().unwrap().push("A_dropped"); + }); + let b = spawn(move || { + // One yield: lets A run and acquire the lock first. + yield_now(); + lb.lock().unwrap().push("B_try"); + let _g = m_b.lock_timeout(Duration::from_millis(500)).unwrap(); + lb.lock().unwrap().push("B_locked"); + }); + a.join().unwrap(); + b.join().unwrap(); + }); + + let v = log.lock().unwrap(); + // A locks, B tries (parks), A drops, B gets the lock. + let pos_a_locked = v.iter().position(|s| *s == "A_locked").unwrap(); + let pos_b_try = v.iter().position(|s| *s == "B_try").unwrap(); + let pos_a_dropped = v.iter().position(|s| *s == "A_dropped").unwrap(); + let pos_b_locked = v.iter().position(|s| *s == "B_locked").unwrap(); + + assert!(pos_a_locked < pos_b_try, "log: {:?}", *v); + assert!(pos_b_try < pos_a_dropped, "B should attempt before A drops: {:?}", *v); + assert!(pos_a_dropped < pos_b_locked, "B should lock only after A drops: {:?}", *v); +} + +// --------------------------------------------------------------------------- +// Timeout: B times out while A holds forever. +// --------------------------------------------------------------------------- + +#[test] +fn lock_timeout_returns_err_when_holder_never_releases() { + let saw_err = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let s = saw_err.clone(); + + run(move || { + let m: Mutex = Mutex::new(0); + let m_a = m.clone(); + let m_b = m.clone(); + + let a = spawn(move || { + // Hold the lock for 100ms, blocking B's attempt with a 20ms timeout. + let _g = m_a.lock_timeout(Duration::from_millis(500)).unwrap(); + smarm::sleep(Duration::from_millis(100)); + // _g drops here. + }); + let b = spawn(move || { + // Let A acquire first. + yield_now(); + let t0 = Instant::now(); + let res = m_b.lock_timeout(Duration::from_millis(20)); + let elapsed = t0.elapsed(); + assert!(matches!(res, Err(LockTimeout)), "got {:?}", res); + // Sanity: actually waited approximately the timeout. + assert!( + elapsed >= Duration::from_millis(15), + "timed out too fast: {:?}", + elapsed + ); + assert!( + elapsed < Duration::from_millis(80), + "timed out far too slow: {:?}", + elapsed + ); + s.store(true, Ordering::SeqCst); + }); + a.join().unwrap(); + b.join().unwrap(); + }); + + assert!(saw_err.load(Ordering::SeqCst)); +} + +// --------------------------------------------------------------------------- +// FIFO fairness: when many actors queue, they get the lock in arrival order. +// --------------------------------------------------------------------------- + +#[test] +fn waiters_are_granted_the_lock_in_fifo_order() { + let order: Arc>> = Arc::new(StdMutex::new(Vec::new())); + + run({ + let order = order.clone(); + move || { + let m: Mutex<()> = Mutex::new(()); + + // Holder: takes the lock, yields to let others queue up, then + // releases. Each waiter records its arrival order on acquisition. + let m_holder = m.clone(); + let holder = spawn(move || { + let g = m_holder.lock_timeout(Duration::from_millis(500)).unwrap(); + // Let waiters pile up. + for _ in 0..5 { + yield_now(); + } + drop(g); + }); + + // Spawn 4 waiters in order 1, 2, 3, 4. Each yields once before + // calling lock(), so we know the holder ran first. + let mut handles = vec![holder]; + for id in 1u32..=4 { + let m_w = m.clone(); + let o = order.clone(); + handles.push(spawn(move || { + // Stagger the lock attempts so they arrive in order. + for _ in 0..id { + yield_now(); + } + let _g = m_w.lock_timeout(Duration::from_millis(500)).unwrap(); + o.lock().unwrap().push(id); + })); + } + for h in handles { + h.join().unwrap(); + } + } + }); + + let v = order.lock().unwrap().clone(); + assert_eq!(v, vec![1, 2, 3, 4], "waiters should acquire in arrival order"); +} + +// --------------------------------------------------------------------------- +// Grant-vs-timeout race: holder drops just before timer would fire — waiter +// should get the lock, not LockTimeout. +// --------------------------------------------------------------------------- + +#[test] +fn grant_wins_when_holder_releases_before_timeout() { + let got_lock = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let g = got_lock.clone(); + + run(move || { + let m: Mutex = Mutex::new(0); + let m_a = m.clone(); + let m_b = m.clone(); + + let a = spawn(move || { + let _g = m_a.lock_timeout(Duration::from_millis(500)).unwrap(); + // Hold for 10ms, well under B's 100ms timeout. + smarm::sleep(Duration::from_millis(10)); + }); + let b = spawn(move || { + yield_now(); + let res = m_b.lock_timeout(Duration::from_millis(100)); + if res.is_ok() { + g.store(true, Ordering::SeqCst); + } + }); + a.join().unwrap(); + b.join().unwrap(); + }); + + assert!(got_lock.load(Ordering::SeqCst)); +} + +// --------------------------------------------------------------------------- +// Panic in critical section: next waiter still gets the lock (no poisoning). +// --------------------------------------------------------------------------- + +#[test] +fn next_waiter_gets_lock_after_holder_panics() { + let next_got_it = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let n = next_got_it.clone(); + + run(move || { + let m: Mutex = Mutex::new(7); + let m_a = m.clone(); + let m_b = m.clone(); + + let a = spawn(move || { + let _g = m_a.lock_timeout(Duration::from_millis(500)).unwrap(); + yield_now(); + panic!("holder dies mid-critical-section"); + }); + let b = spawn(move || { + yield_now(); + // A is dead but its guard's Drop ran during unwind. We get the lock. + let g = m_b.lock_timeout(Duration::from_millis(100)).unwrap(); + assert_eq!(*g, 7); + n.store(true, Ordering::SeqCst); + }); + let _ = a.join(); // panic — expected + b.join().unwrap(); + }); + + assert!(next_got_it.load(Ordering::SeqCst)); +} + +// --------------------------------------------------------------------------- +// Multiple short critical sections under contention all complete (no lost +// wakeups, no deadlock). Counts up to N from M actors. +// --------------------------------------------------------------------------- + +#[test] +fn many_actors_increment_shared_counter_via_mutex() { + const ACTORS: u32 = 8; + const PER_ACTOR: u32 = 50; + + let final_value = Arc::new(AtomicU32::new(0)); + let fv = final_value.clone(); + + run(move || { + let m: Mutex = Mutex::new(0); + let mut handles = Vec::new(); + for _ in 0..ACTORS { + let m_i = m.clone(); + handles.push(spawn(move || { + for _ in 0..PER_ACTOR { + let mut g = m_i.lock_timeout(Duration::from_millis(500)).unwrap(); + *g += 1; + } + })); + } + for h in handles { + h.join().unwrap(); + } + let g = m.lock_timeout(Duration::from_millis(500)).unwrap(); + fv.store(*g, Ordering::SeqCst); + }); + + assert_eq!(final_value.load(Ordering::SeqCst), ACTORS * PER_ACTOR); +} diff --git a/tests/pid.rs b/tests/pid.rs new file mode 100644 index 0000000..8ea410b --- /dev/null +++ b/tests/pid.rs @@ -0,0 +1,22 @@ +use smarm::pid::Pid; + +#[test] +fn pid_equality() { + assert_eq!(Pid::new(0, 0), Pid::new(0, 0)); + assert_ne!(Pid::new(0, 0), Pid::new(0, 1)); + assert_ne!(Pid::new(0, 0), Pid::new(1, 0)); +} + +#[test] +fn pid_accessors() { + let p = Pid::new(42, 7); + assert_eq!(p.index(), 42); + assert_eq!(p.generation(), 7); +} + +#[test] +fn pid_debug_is_useful() { + let p = Pid::new(3, 5); + let s = format!("{:?}", p); + assert!(s.contains('3') && s.contains('5'), "got: {}", s); +} diff --git a/tests/preempt.rs b/tests/preempt.rs new file mode 100644 index 0000000..3a04288 --- /dev/null +++ b/tests/preempt.rs @@ -0,0 +1,66 @@ +//! Tests for explicit preemption via `smarm::check!()`. + +use smarm::{run, spawn}; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::Arc; + +#[test] +fn check_yields_when_timeslice_expired() { + // A single actor that drives the timeslice clock to zero manually, + // then calls check!() and expects to yield. The scheduler has nothing + // else to run, so it just re-queues us. To prove we actually yielded, + // observe the run counter on the slot... we don't have one. So + // instead: spawn a second actor that increments a counter and joins + // it; verify both actors made progress in interleaved order under + // forced timeslice expiry. + let order: Arc>> = Arc::new(std::sync::Mutex::new(Vec::new())); + let o1 = order.clone(); + let o2 = order.clone(); + + run(move || { + let a = spawn(move || { + o1.lock().unwrap().push(b'A'); + // Force the timeslice to be considered expired. + smarm::preempt::expire_timeslice_for_test(); + smarm::check!(); + o1.lock().unwrap().push(b'a'); + }); + let b = spawn(move || { + o2.lock().unwrap().push(b'B'); + smarm::preempt::expire_timeslice_for_test(); + smarm::check!(); + o2.lock().unwrap().push(b'b'); + }); + a.join().unwrap(); + b.join().unwrap(); + }); + + // FIFO scheduling + forced preemption: A starts, expires, yields to B; + // B starts, expires, yields to A; A finishes, B finishes. + // Required: both uppercase letters appear before either lowercase. + let v = order.lock().unwrap(); + let pos_big_a = v.iter().position(|&c| c == b'A').unwrap(); + let pos_big_b = v.iter().position(|&c| c == b'B').unwrap(); + let pos_lit_a = v.iter().position(|&c| c == b'a').unwrap(); + let pos_lit_b = v.iter().position(|&c| c == b'b').unwrap(); + assert!(pos_big_a < pos_lit_a, "A's tail ran before B's head: {:?}", *v); + assert!(pos_big_b < pos_lit_b, "B's tail ran before A's head: {:?}", *v); + assert!(pos_big_a.max(pos_big_b) < pos_lit_a.min(pos_lit_b), + "preemption didn't interleave: {:?}", *v); +} + +#[test] +fn check_is_a_noop_when_timeslice_not_expired() { + // After a fresh resume, check!() should be cheap and not yield. Run + // a single actor that calls check!() many times; it should complete + // promptly. + let count = Arc::new(AtomicU64::new(0)); + let c = count.clone(); + run(move || { + for _ in 0..1_000 { + smarm::check!(); + c.fetch_add(1, Ordering::Relaxed); + } + }); + assert_eq!(count.load(Ordering::Relaxed), 1_000); +} diff --git a/tests/runtime.rs b/tests/runtime.rs new file mode 100644 index 0000000..e4c7b32 --- /dev/null +++ b/tests/runtime.rs @@ -0,0 +1,426 @@ +//! Tests for the multi-scheduler runtime: Config, Runtime::run, and +//! correctness under genuine parallelism. +//! +//! The single-threaded correctness properties (channel ordering, mutex +//! fairness, timer accuracy, etc.) are already covered by the per-module +//! tests. This file focuses on what changes when N > 1 scheduler threads +//! are involved: +//! +//! - Config construction and validation +//! - Runtime::run blocks until all actors finish +//! - All existing cooperative behaviours hold under multi-threading +//! - Actors genuinely run on different OS threads +//! - No lost wakeups under concurrent park/unpark +//! - No slot leaks under high spawn/join churn +//! - Panic on one scheduler thread doesn't kill others + +use smarm::{channel, runtime::{Config, Runtime}, spawn, yield_now, JoinHandle}; +use std::sync::{ + atomic::{AtomicBool, AtomicU64, AtomicUsize, Ordering}, + Arc, Barrier, +}; +use std::time::Duration; +use std::collections::HashSet; + +// --------------------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------------------- + +/// Build a runtime with exactly `n` scheduler threads. +fn rt(n: usize) -> Runtime { + smarm::runtime::init(Config::exact(n)) +} + +/// Convenient single-threaded runtime (regression guard). +fn rt1() -> Runtime { rt(1) } + +/// Multi-threaded runtime using all available parallelism. +fn rt_par() -> Runtime { + smarm::runtime::init(Config::default()) +} + +// --------------------------------------------------------------------------- +// Config +// --------------------------------------------------------------------------- + +#[test] +fn config_exact_overrides_bounds() { + let c = Config::exact(3); + assert_eq!(c.resolved_thread_count(), 3); +} + +#[test] +fn config_default_clamps_to_available_parallelism() { + let c = Config::default(); + let n = c.resolved_thread_count(); + let avail = std::thread::available_parallelism() + .map(|n| n.get()) + .unwrap_or(1); + // Default min is 1, default max is available_parallelism. + assert!(n >= 1 && n <= avail); +} + +#[test] +fn config_min_max_clamps() { + // Force a range that excludes exact: min=2, max=4, available might be >4. + let c = Config::new(2, 4, None); + let n = c.resolved_thread_count(); + assert!(n >= 2 && n <= 4, "expected 2..=4, got {n}"); +} + +#[test] +fn config_min_1_max_1_is_single_threaded() { + let c = Config::new(1, 1, None); + assert_eq!(c.resolved_thread_count(), 1); +} + +// --------------------------------------------------------------------------- +// Runtime::run — basic lifecycle +// --------------------------------------------------------------------------- + +#[test] +fn runtime_run_executes_closure() { + let flag = Arc::new(AtomicBool::new(false)); + let f = flag.clone(); + rt(1).run(move || { f.store(true, Ordering::SeqCst); }); + assert!(flag.load(Ordering::SeqCst)); +} + +#[test] +fn runtime_run_blocks_until_all_actors_done() { + // Spawn a chain of actors; the counter should be exactly N when run returns. + let counter = Arc::new(AtomicU64::new(0)); + let c = counter.clone(); + rt(2).run(move || { + let mut handles = Vec::new(); + for _ in 0..20 { + let cc = c.clone(); + handles.push(spawn(move || { + cc.fetch_add(1, Ordering::SeqCst); + })); + } + for h in handles { + h.join().unwrap(); + } + }); + assert_eq!(counter.load(Ordering::SeqCst), 20); +} + +#[test] +fn runtime_can_be_used_multiple_times_sequentially() { + // Each call to run() is independent. + let r = rt(2); + let a = Arc::new(AtomicU64::new(0)); + let b = Arc::new(AtomicU64::new(0)); + let ac = a.clone(); + let bc = b.clone(); + r.run(move || { ac.fetch_add(1, Ordering::SeqCst); }); + r.run(move || { bc.fetch_add(1, Ordering::SeqCst); }); + assert_eq!(a.load(Ordering::SeqCst), 1); + assert_eq!(b.load(Ordering::SeqCst), 1); +} + +// --------------------------------------------------------------------------- +// Single-threaded regression: exact(1) must behave identically to old run() +// --------------------------------------------------------------------------- + +#[test] +fn exact_1_spawn_join_works() { + let v = Arc::new(AtomicU64::new(0)); + let vc = v.clone(); + rt1().run(move || { + let h = spawn(move || { vc.store(42, Ordering::SeqCst); }); + h.join().unwrap(); + }); + assert_eq!(v.load(Ordering::SeqCst), 42); +} + +#[test] +fn exact_1_channel_recv_parks_and_wakes() { + let v = Arc::new(AtomicU64::new(0)); + let vc = v.clone(); + rt1().run(move || { + let (tx, rx) = channel::(); + let h = spawn(move || { + let val = rx.recv().unwrap(); + vc.store(val, Ordering::SeqCst); + }); + yield_now(); + tx.send(99).unwrap(); + h.join().unwrap(); + }); + assert_eq!(v.load(Ordering::SeqCst), 99); +} + +#[test] +fn exact_1_panic_captured() { + let saw_err = Arc::new(AtomicBool::new(false)); + let s = saw_err.clone(); + rt1().run(move || { + let h = spawn(|| panic!("oops")); + if h.join().is_err() { s.store(true, Ordering::SeqCst); } + }); + assert!(saw_err.load(Ordering::SeqCst)); +} + +// --------------------------------------------------------------------------- +// Multi-threaded correctness +// --------------------------------------------------------------------------- + +#[test] +fn multi_thread_all_actors_complete() { + let counter = Arc::new(AtomicU64::new(0)); + let c = counter.clone(); + rt_par().run(move || { + let mut handles = Vec::new(); + for _ in 0..100 { + let cc = c.clone(); + handles.push(spawn(move || { + cc.fetch_add(1, Ordering::SeqCst); + })); + } + for h in handles { h.join().unwrap(); } + }); + assert_eq!(counter.load(Ordering::SeqCst), 100); +} + +#[test] +fn multi_thread_channel_wakeup_across_threads() { + // Receiver parks; sender runs (potentially on a different OS thread). + // Verifies no lost wakeup. + let received = Arc::new(AtomicU64::new(0)); + let rc = received.clone(); + rt_par().run(move || { + let (tx, rx) = channel::(); + let h = spawn(move || { + let v = rx.recv().unwrap(); + rc.store(v, Ordering::SeqCst); + }); + // Let receiver park. + yield_now(); + tx.send(7).unwrap(); + h.join().unwrap(); + }); + assert_eq!(received.load(Ordering::SeqCst), 7); +} + +#[test] +fn multi_thread_many_channels_no_lost_wakeups() { + // N pairs of (sender actor, receiver actor). Each pair exchanges one + // message. All must complete — any lost wakeup causes a deadlock/timeout. + const PAIRS: usize = 50; + let count = Arc::new(AtomicU64::new(0)); + let c = count.clone(); + rt_par().run(move || { + let mut handles: Vec = Vec::new(); + for _ in 0..PAIRS { + let (tx, rx) = channel::(); + let cc = c.clone(); + handles.push(spawn(move || { + let v = rx.recv().unwrap(); + cc.fetch_add(v, Ordering::SeqCst); + })); + handles.push(spawn(move || { + tx.send(1).unwrap(); + })); + } + for h in handles { h.join().unwrap(); } + }); + assert_eq!(count.load(Ordering::SeqCst), PAIRS as u64); +} + +#[test] +fn multi_thread_mutex_contention_no_deadlock() { + use smarm::Mutex; + const ACTORS: usize = 20; + const PER: u64 = 100; + let total = Arc::new(AtomicU64::new(0)); + let t = total.clone(); + rt_par().run(move || { + let m: Mutex = Mutex::new(0); + let mut handles = Vec::new(); + for _ in 0..ACTORS { + let mc = m.clone(); + let tc = t.clone(); + handles.push(spawn(move || { + for _ in 0..PER { + let mut g = mc.lock_timeout(Duration::from_secs(5)).unwrap(); + *g += 1; + tc.fetch_add(0, Ordering::SeqCst); // just a memory barrier + } + })); + } + for h in handles { h.join().unwrap(); } + let g = m.lock_timeout(Duration::from_secs(1)).unwrap(); + t.store(*g, Ordering::SeqCst); + }); + assert_eq!(total.load(Ordering::SeqCst), ACTORS as u64 * PER); +} + +#[test] +fn multi_thread_join_across_threads() { + // Parent joins a child that may run on a different scheduler thread. + let v = Arc::new(AtomicU64::new(0)); + let vc = v.clone(); + rt_par().run(move || { + let h = spawn(move || { + // Do some work to make scheduling interesting. + for _ in 0..10 { yield_now(); } + vc.store(1, Ordering::SeqCst); + }); + h.join().unwrap(); + }); + assert_eq!(v.load(Ordering::SeqCst), 1); +} + +// --------------------------------------------------------------------------- +// Actors run on distinct OS threads +// +// We collect the OS thread IDs that actors execute on. With N schedulers +// and enough actors, we expect to see more than one thread ID. +// --------------------------------------------------------------------------- + +#[test] +fn actors_run_on_multiple_os_threads() { + let thread_ids: Arc>> = + Arc::new(smarm::Mutex::new(HashSet::new())); + + rt_par().run({ + let ids = thread_ids.clone(); + move || { + let mut handles = Vec::new(); + for _ in 0..64 { + let idc = ids.clone(); + handles.push(spawn(move || { + let tid = unsafe { libc::syscall(libc::SYS_gettid) as u64 }; + let mut g = idc.lock_timeout(Duration::from_secs(1)).unwrap(); + g.insert(tid); + })); + } + for h in handles { h.join().unwrap(); } + } + }); + + let n = std::thread::available_parallelism().map(|n| n.get()).unwrap_or(1); + + let ids = thread_ids.lock_timeout(Duration::from_secs(1)).unwrap(); + // If we have >1 scheduler threads, we expect >1 OS thread IDs. + // On a single-CPU machine this may be 1; we just assert ≥ 1. + assert!(!ids.is_empty()); + if n > 1 { + // Strongly expect parallelism — not a hard assert since scheduling + // is non-deterministic, but 64 actors should spread. + // We log rather than assert to avoid flakiness on loaded CI. + if ids.len() == 1 { + eprintln!("WARNING: 64 actors all ran on the same OS thread (flaky on loaded system)"); + } + } +} + +// --------------------------------------------------------------------------- +// Scheduler stats (RFC 000 Layer 1 primitives) +// --------------------------------------------------------------------------- + +#[test] +fn scheduler_stats_run_queue_len_is_observable() { + // After spawning actors but before they run, the queue should be non-empty. + // We can't observe this from inside run() without a snapshot API, but we + // can verify the stats struct is accessible and returns sane values after + // run() completes (queue len == 0 at quiescence). + let r = rt_par(); + r.run(|| { + for _ in 0..10 { spawn(|| {}); } + // Don't join — let them drain naturally. + }); + let stats = r.stats(); + assert_eq!(stats.total_run_queue_len(), 0, "queue should be empty after run()"); +} + +#[test] +fn scheduler_stats_thread_count_matches_config() { + let r = rt(3); + r.run(|| {}); + assert_eq!(r.stats().scheduler_count(), 3); +} + +// --------------------------------------------------------------------------- +// Panic isolation: a panicking actor doesn't kill the scheduler thread +// --------------------------------------------------------------------------- + +#[test] +fn panic_in_actor_does_not_kill_runtime() { + let completed = Arc::new(AtomicU64::new(0)); + let c = completed.clone(); + rt_par().run(move || { + // Spawn a panicker alongside well-behaved actors. + let bad = spawn(|| panic!("deliberate")); + let mut good_handles = Vec::new(); + for _ in 0..10 { + let cc = c.clone(); + good_handles.push(spawn(move || { + cc.fetch_add(1, Ordering::SeqCst); + })); + } + let _ = bad.join(); // expect Err + for h in good_handles { h.join().unwrap(); } + }); + assert_eq!(completed.load(Ordering::SeqCst), 10); +} + +// --------------------------------------------------------------------------- +// No slot leaks: rapid spawn/join churn +// --------------------------------------------------------------------------- + +#[test] +fn no_slot_leak_under_churn() { + // Spawn and join many short actors in a loop. If slots leak, the slot + // table grows unboundedly. We can't directly measure it without an + // introspection API, but the test at least checks correctness under + // churn and will OOM if there's a severe leak. + let counter = Arc::new(AtomicU64::new(0)); + let c = counter.clone(); + rt_par().run(move || { + for _ in 0..500 { + let cc = c.clone(); + spawn(move || { cc.fetch_add(1, Ordering::SeqCst); }) + .join() + .unwrap(); + } + }); + assert_eq!(counter.load(Ordering::SeqCst), 500); +} + +// --------------------------------------------------------------------------- +// Ping-pong: channel round-trips between two actors +// --------------------------------------------------------------------------- + +#[test] +fn ping_pong_completes() { + const ROUNDS: u64 = 1_000; + let final_val = Arc::new(AtomicU64::new(0)); + let fv = final_val.clone(); + rt_par().run(move || { + let (tx_a, rx_a) = channel::(); + let (tx_b, rx_b) = channel::(); + let h_a = spawn(move || { + tx_a.send(0).unwrap(); + for _ in 0..ROUNDS { + let v = rx_b.recv().unwrap(); + tx_a.send(v + 1).unwrap(); + } + }); + let h_b = spawn(move || { + for _ in 0..=ROUNDS { + let v = rx_a.recv().unwrap(); + if v < ROUNDS { + tx_b.send(v).unwrap(); + } else { + fv.store(v, Ordering::SeqCst); + } + } + }); + h_a.join().unwrap(); + h_b.join().unwrap(); + }); + assert_eq!(final_val.load(Ordering::SeqCst), ROUNDS); +} diff --git a/tests/scheduler.rs b/tests/scheduler.rs new file mode 100644 index 0000000..ed7a70d --- /dev/null +++ b/tests/scheduler.rs @@ -0,0 +1,171 @@ +//! End-to-end scheduler tests: spawning, joining, panic delivery, +//! yield_now, self_pid. + +use smarm::{channel, run, self_pid, spawn, spawn_under, yield_now, Signal}; +use std::cell::Cell; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::Arc; + +// --------------------------------------------------------------------------- +// Single root actor runs to completion +// --------------------------------------------------------------------------- + +#[test] +fn root_actor_runs() { + let captured = Arc::new(AtomicI64::new(0)); + let c = captured.clone(); + run(move || { c.store(99, Ordering::SeqCst); }); + assert_eq!(captured.load(Ordering::SeqCst), 99); +} + +// --------------------------------------------------------------------------- +// Spawn child, join it +// --------------------------------------------------------------------------- + +#[test] +fn spawn_and_join_returns_exit() { + let captured = Arc::new(AtomicI64::new(0)); + let c = captured.clone(); + run(move || { + let h = spawn(move || { c.store(7, Ordering::SeqCst); }); + let res = h.join(); + assert!(res.is_ok(), "join returned {:?}", res); + }); + assert_eq!(captured.load(Ordering::SeqCst), 7); +} + +// --------------------------------------------------------------------------- +// yield_now lets a sibling run +// --------------------------------------------------------------------------- + +#[test] +fn yield_now_interleaves_actors() { + let log: Arc>> = Arc::new(std::sync::Mutex::new(Vec::new())); + let l1 = log.clone(); + let l2 = log.clone(); + run(move || { + let h1 = spawn(move || { + l1.lock().unwrap().push(1); + yield_now(); + l1.lock().unwrap().push(3); + }); + let h2 = spawn(move || { + l2.lock().unwrap().push(2); + yield_now(); + l2.lock().unwrap().push(4); + }); + h1.join().unwrap(); + h2.join().unwrap(); + }); + // Both actors get their first step before either second step. Exact order + // is FIFO: 1, 2, then 3, 4. + assert_eq!(*log.lock().unwrap(), vec![1, 2, 3, 4]); +} + +// --------------------------------------------------------------------------- +// self_pid returns this actor's pid inside the actor +// --------------------------------------------------------------------------- + +#[test] +fn self_pid_is_stable_within_an_actor() { + let pid_cell: Arc>> = + Arc::new(std::sync::Mutex::new(None)); + let p2 = pid_cell.clone(); + run(move || { + let h = spawn(move || { + let me = self_pid(); + yield_now(); + assert_eq!(me, self_pid(), "self_pid changed across yield"); + *p2.lock().unwrap() = Some(me); + }); + h.join().unwrap(); + }); + assert!(pid_cell.lock().unwrap().is_some()); +} + +// --------------------------------------------------------------------------- +// Panic is captured; join returns Err; supervisor receives Signal::Panic +// --------------------------------------------------------------------------- + +#[test] +fn panicking_child_returns_join_error() { + let saw_err = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let s = saw_err.clone(); + run(move || { + let h = spawn(|| panic!("kaboom")); + if h.join().is_err() { + s.store(true, Ordering::SeqCst); + } + }); + + assert!(saw_err.load(Ordering::SeqCst)); +} + +#[test] +fn supervisor_receives_panic_signal() { + let saw_panic_signal = Arc::new(std::sync::atomic::AtomicBool::new(false)); + let s = saw_panic_signal.clone(); + + run(move || { + // Build a supervisor actor with its own mailbox. + let (sig_tx, sig_rx) = channel::(); + let sup_handle = spawn(move || { + // Wait for exactly one signal. + let sig = sig_rx.recv().unwrap(); + if let Signal::Panic(_, _) = sig { + s.store(true, Ordering::SeqCst); + } + }); + // Tell the runtime: when I spawn the next child, route signals here. + let sup_pid = sup_handle.pid(); + smarm::scheduler::register_supervisor_channel(sup_pid, sig_tx); + + let child = spawn_under(sup_pid, || panic!("oops")); + let _ = child.join(); + sup_handle.join().unwrap(); + }); + + assert!(saw_panic_signal.load(Ordering::SeqCst)); +} + +// --------------------------------------------------------------------------- +// Multiple children, all complete, parent gets back control +// --------------------------------------------------------------------------- + +#[test] +fn many_children_all_complete() { + let counter = Arc::new(AtomicI64::new(0)); + let c = counter.clone(); + run(move || { + let mut handles = Vec::new(); + for _ in 0..10 { + let cc = c.clone(); + handles.push(spawn(move || { + cc.fetch_add(1, Ordering::SeqCst); + })); + } + for h in handles { + h.join().unwrap(); + } + }); + assert_eq!(counter.load(Ordering::SeqCst), 10); +} + +// --------------------------------------------------------------------------- +// Repeated yield_now inside an actor with no other actors completes +// --------------------------------------------------------------------------- + +#[test] +fn yield_alone_terminates() { + thread_local! { + static N: Cell = const { Cell::new(0) }; + } + N.with(|c| c.set(0)); + run(|| { + for _ in 0..5 { + N.with(|c| c.set(c.get() + 1)); + yield_now(); + } + }); + assert_eq!(N.with(|c| c.get()), 5); +} diff --git a/tests/stack.rs b/tests/stack.rs new file mode 100644 index 0000000..cec741a --- /dev/null +++ b/tests/stack.rs @@ -0,0 +1,123 @@ +//! Stack allocator tests. +//! +//! Covers allocation, alignment, read/write across the usable region, and +//! (via subprocess) that the guard page actually SIGSEGVs. + +use smarm::stack::Stack; + +#[test] +fn top_is_16_byte_aligned() { + let s = Stack::new(64 * 1024).unwrap(); + assert_eq!(s.top() as usize % 16, 0); +} + +#[test] +fn top_is_within_allocation() { + let s = Stack::new(64 * 1024).unwrap(); + let top = s.top() as usize; + let base = s.usable_base() as usize; + assert!(top > base); + assert!(top <= base + s.stack_size()); +} + +#[test] +fn write_and_read_top_of_stack() { + let s = Stack::new(64 * 1024).unwrap(); + let sentinel: u64 = 0xDEAD_BEEF_CAFE_1234; + unsafe { + let ptr = s.top().sub(8) as *mut u64; + ptr.write_volatile(sentinel); + assert_eq!(ptr.read_volatile(), sentinel); + } +} + +#[test] +fn write_and_read_bottom_of_usable_region() { + let s = Stack::new(64 * 1024).unwrap(); + let sentinel: u64 = 0x0102_0304_0506_0708; + unsafe { + let ptr = s.usable_base() as *mut u64; + ptr.write_volatile(sentinel); + assert_eq!(ptr.read_volatile(), sentinel); + } +} + +#[test] +fn small_stack_allocates() { + assert!(Stack::new(4096).is_ok()); +} + +#[test] +fn large_stack_allocates() { + assert!(Stack::new(8 * 1024 * 1024).is_ok()); +} + +#[test] +fn stack_size_at_least_requested() { + let s = Stack::new(64 * 1024).unwrap(); + assert!(s.stack_size() >= 64 * 1024); +} + +// --------------------------------------------------------------------------- +// Guard page SIGSEGV tests — subprocess-based. +// --------------------------------------------------------------------------- + +use std::env; +use std::process::Command; + +fn run_as_child_if_requested() { + match env::var("SMARM_SUBTEST").as_deref() { + Ok("guard_page_direct") => { + let s = Stack::new(64 * 1024).unwrap(); + unsafe { + let guard_ptr = s.usable_base().sub(1); + guard_ptr.write_volatile(0xAB); + } + std::process::exit(0); + } + Ok("stack_overflow") => { + let s = Stack::new(64 * 1024).unwrap(); + unsafe { + let mut ptr = s.top().sub(1); + let stop = s.usable_base().sub(1); + while ptr >= stop { + ptr.write_volatile(0xFF); + ptr = ptr.sub(1); + } + } + std::process::exit(0); + } + _ => {} + } +} + +fn spawn_subtest(name: &str) -> std::process::ExitStatus { + let exe = env::current_exe().unwrap(); + Command::new(exe) + .env("SMARM_SUBTEST", name) + .args(["--test-threads=1", "--quiet"]) + .status() + .expect("failed to spawn subprocess") +} + +#[test] +fn guard_page_causes_sigsegv() { + run_as_child_if_requested(); + let status = spawn_subtest("guard_page_direct"); + #[cfg(unix)] + { + use std::os::unix::process::ExitStatusExt; + assert_eq!(status.signal(), Some(11), "expected SIGSEGV, got: {:?}", status); + } +} + +#[test] +fn stack_overflow_causes_sigsegv() { + run_as_child_if_requested(); + let status = spawn_subtest("stack_overflow"); + #[cfg(unix)] + { + use std::os::unix::process::ExitStatusExt; + assert_eq!(status.signal(), Some(11), "expected SIGSEGV, got: {:?}", status); + } +} diff --git a/tests/timer.rs b/tests/timer.rs new file mode 100644 index 0000000..2f94dd7 --- /dev/null +++ b/tests/timer.rs @@ -0,0 +1,207 @@ +//! Timer / sleep tests. These are time-sensitive and use generous +//! tolerances — we care about ordering and "didn't return instantly / +//! didn't take forever," not microsecond-precise scheduling. + +use smarm::{run, sleep, spawn}; +use std::sync::Arc; +use std::sync::Mutex; +use std::time::{Duration, Instant}; + +#[test] +fn sleep_returns_after_at_least_the_requested_duration() { + run(|| { + let t0 = Instant::now(); + sleep(Duration::from_millis(50)); + let elapsed = t0.elapsed(); + assert!( + elapsed >= Duration::from_millis(45), + "slept only {:?}, expected ≥ ~50ms", + elapsed + ); + // Loose upper bound — anything wildly slow indicates a bug. + assert!( + elapsed < Duration::from_millis(500), + "slept {:?}, far longer than the 50ms request", + elapsed + ); + }); +} + +#[test] +fn shorter_sleep_wakes_first() { + let log: Arc>> = Arc::new(Mutex::new(Vec::new())); + let l1 = log.clone(); + let l2 = log.clone(); + + run(move || { + let h1 = spawn(move || { + sleep(Duration::from_millis(60)); + l1.lock().unwrap().push(1); + }); + let h2 = spawn(move || { + sleep(Duration::from_millis(20)); + l2.lock().unwrap().push(2); + }); + h1.join().unwrap(); + h2.join().unwrap(); + }); + + // 2 (shorter sleep) wakes before 1. + assert_eq!(*log.lock().unwrap(), vec![2, 1]); +} + +#[test] +fn one_sleeping_actor_does_not_block_other_runnable_actors() { + let log: Arc>> = Arc::new(Mutex::new(Vec::new())); + let l1 = log.clone(); + let l2 = log.clone(); + + run(move || { + let h1 = spawn(move || { + sleep(Duration::from_millis(100)); + l1.lock().unwrap().push(1); + }); + let h2 = spawn(move || { + // Doesn't sleep. Should be able to run while h1 is parked. + for _ in 0..3 { + l2.lock().unwrap().push(2); + smarm::yield_now(); + } + }); + h2.join().unwrap(); + h1.join().unwrap(); + }); + + let v = log.lock().unwrap(); + // h2 finishes long before h1's 100ms timer. + let h2_count = v.iter().filter(|&&x| x == 2).count(); + let h1_pos = v.iter().position(|&x| x == 1); + assert_eq!(h2_count, 3); + // h1's push should land after h2 is fully done. + if let Some(p) = h1_pos { + assert!(p >= h2_count, "h1 woke before h2 finished: log = {:?}", *v); + } +} + +#[test] +fn zero_duration_sleep_yields_but_does_not_park_forever() { + // A zero-duration sleep should behave like yield_now: control returns + // promptly without hanging. + run(|| { + let t0 = Instant::now(); + sleep(Duration::from_millis(0)); + assert!(t0.elapsed() < Duration::from_millis(100)); + }); +} + +#[test] +fn many_concurrent_sleepers_all_wake() { + let counter = Arc::new(std::sync::atomic::AtomicU32::new(0)); + let c = counter.clone(); + run(move || { + let mut handles = Vec::new(); + for i in 0..20u64 { + let cc = c.clone(); + handles.push(spawn(move || { + // Stagger so they don't all coalesce to the same wake. + sleep(Duration::from_millis(5 + i * 2)); + cc.fetch_add(1, std::sync::atomic::Ordering::SeqCst); + })); + } + for h in handles { + h.join().unwrap(); + } + }); + assert_eq!(counter.load(std::sync::atomic::Ordering::SeqCst), 20); +} + +// --------------------------------------------------------------------------- +// Direct tests on the Timers data structure. No scheduler involved — these +// cover the new Reason machinery without needing a Mutex implementation. +// --------------------------------------------------------------------------- + +use smarm::pid::Pid; +use smarm::timer::{Reason, TimerTarget, Timers}; + +struct RecordingTarget { + calls: Mutex>, +} +impl TimerTarget for RecordingTarget { + fn on_timeout(&self, pid: Pid, seq: u64) { + self.calls.lock().unwrap().push((pid, seq)); + } +} + +#[test] +fn timers_pop_due_returns_entries_in_deadline_order() { + let mut t = Timers::new(); + let now = Instant::now(); + // Insert out of order; pop_due should hand them back sorted by deadline. + t.insert_sleep(now + Duration::from_millis(30), Pid::new(0, 0)); + t.insert_sleep(now + Duration::from_millis(10), Pid::new(1, 0)); + t.insert_sleep(now + Duration::from_millis(20), Pid::new(2, 0)); + + // Advance past all of them. + let due = t.pop_due(now + Duration::from_millis(50)); + let pids: Vec = due.iter().map(|e| e.pid.index()).collect(); + assert_eq!(pids, vec![1, 2, 0]); + assert!(t.is_empty()); +} + +#[test] +fn timers_only_pop_entries_whose_deadline_has_passed() { + let mut t = Timers::new(); + let now = Instant::now(); + t.insert_sleep(now + Duration::from_millis(5), Pid::new(0, 0)); + t.insert_sleep(now + Duration::from_millis(100), Pid::new(1, 0)); + + let due = t.pop_due(now + Duration::from_millis(20)); + assert_eq!(due.len(), 1); + assert_eq!(due[0].pid.index(), 0); + assert!(!t.is_empty()); + // The unpopped entry's deadline is still visible. + assert!(t.peek_deadline().is_some()); +} + +#[test] +fn timers_mix_sleep_and_wait_timeout_reasons() { + let mut t = Timers::new(); + let target = Arc::new(RecordingTarget { calls: Mutex::new(Vec::new()) }); + let now = Instant::now(); + + t.insert_sleep(now + Duration::from_millis(5), Pid::new(0, 0)); + t.insert( + now + Duration::from_millis(10), + Pid::new(1, 0), + Reason::WaitTimeout { target: target.clone(), wait_seq: 42 }, + ); + + let due = t.pop_due(now + Duration::from_millis(20)); + assert_eq!(due.len(), 2); + + // Order: Sleep (5ms) first, WaitTimeout (10ms) second. + match &due[0].reason { + Reason::Sleep => {} + _ => panic!("first entry should be a Sleep"), + } + match &due[1].reason { + Reason::WaitTimeout { wait_seq, .. } => assert_eq!(*wait_seq, 42), + _ => panic!("second entry should be a WaitTimeout"), + } +} + +#[test] +fn same_deadline_entries_pop_in_insertion_order() { + // The `seq` tiebreaker means inserting two entries with the same + // deadline preserves the order they were inserted. + let mut t = Timers::new(); + let now = Instant::now(); + let d = now + Duration::from_millis(10); + t.insert_sleep(d, Pid::new(0, 0)); + t.insert_sleep(d, Pid::new(1, 0)); + t.insert_sleep(d, Pid::new(2, 0)); + + let due = t.pop_due(now + Duration::from_millis(20)); + let pids: Vec = due.iter().map(|e| e.pid.index()).collect(); + assert_eq!(pids, vec![0, 1, 2]); +}