v0.1: green-thread actors, supervision, channels, benchmark

Hand-rolled context switching on mmap'd stacks with guard pages, allocator-driven RDTSC preemption, unbounded MPSC channels, supervision via per-slot Signal mailboxes, root supervisor as sentinel PID. Lib + tests + benches clean check/clippy. All 29 tests pass. Bench: smarm 3.4% over serial baseline, within 160us of tokio current-thread on prime-counting fan-out.
2026-05-22 05:01:51 +00:00
commit 0e9d9d7d5f
17 changed files with 1938 additions and 0 deletions
@@ -0,0 +1,110 @@
+//! Actor descriptor and trampoline.
+//!
+//! An `Actor` owns its stack and holds the closure it will run. The
+//! `trampoline` is a fixed `extern "C-unwind" fn()` that every actor enters
+//! through; it pulls the closure out of a thread-local set by the scheduler
+//! immediately before resume, invokes it inside `catch_unwind`, records the
+//! outcome, and switches back to the scheduler.
+//!
+//! Why a thread-local and not, say, passing the closure pointer via a
+//! register? Because the first resume goes through `ret`, not `call`, and
+//! we have no other channel for parameters. The scheduler sets the
+//! thread-local, switches in, the trampoline reads it. After the first
+//! resume the closure has been consumed, so subsequent resumes don't need it.
+
+use crate::context::switch_to_scheduler;
+use crate::pid::Pid;
+use crate::stack::Stack;
+use std::any::Any;
+use std::cell::{Cell, RefCell};
+use std::panic;
+
+/// Terminal result of an actor's closure, as recorded by the trampoline.
+///
+/// Stored on the actor's slot and drained by `JoinHandle::join` once the
+/// slot is marked done.
+pub enum Outcome {
+    /// The closure returned normally.
+    Exit,
+    /// The closure panicked; carries the payload caught by `catch_unwind`.
+    Panic(Box<dyn Any + Send>),
+}
+
+// Per-OS-thread hand-off state between the scheduler and the trampoline.
+// The accessor functions below are the only way other modules touch it.
+thread_local! {
+    /// The closure for the actor we're about to resume *for the first time*.
+    /// Installed with `set_current_actor_box`; consumed on first entry into
+    /// the trampoline and `None` thereafter.
+    static CURRENT_ACTOR_BOX: RefCell<Option<Box<dyn FnOnce() + Send>>> =
+        const { RefCell::new(None) };
+
+    /// The PID of the actor currently executing on this OS thread.
+    /// Set on every resume so that `self_pid()` works inside actor code;
+    /// `None` while no actor is running.
+    static CURRENT_PID: Cell<Option<Pid>> = const { Cell::new(None) };
+
+    /// Filled by the trampoline when the actor returns (normally or via
+    /// panic). The scheduler reads this after `switch_to_actor` returns,
+    /// via `take_last_outcome`.
+    static LAST_OUTCOME: RefCell<Option<Outcome>> = const { RefCell::new(None) };
+
+    /// Set by the trampoline on completion; reset by the scheduler before
+    /// each resume so it never sees stale state.
+    static ACTOR_DONE: Cell<bool> = const { Cell::new(false) };
+}
+
+/// Install the closure the trampoline will run on the next first-time
+/// resume, replacing (and dropping) any closure left from before.
+pub fn set_current_actor_box(b: Box<dyn FnOnce() + Send>) {
+    CURRENT_ACTOR_BOX.set(Some(b));
+}
+
+/// Record `p` as the actor currently executing on this OS thread.
+pub fn set_current_pid(p: Pid) {
+    CURRENT_PID.set(Some(p));
+}
+
+/// Forget the current actor: `current_pid()` returns `None` afterwards.
+pub fn clear_current_pid() {
+    CURRENT_PID.set(None);
+}
+
+/// PID of the actor executing on this OS thread, or `None` when no actor
+/// is running.
+pub fn current_pid() -> Option<Pid> {
+    CURRENT_PID.get()
+}
+
+/// Clear the completion flag so the next resume starts from a clean state.
+pub fn reset_actor_done() {
+    ACTOR_DONE.set(false);
+}
+
+/// Whether the trampoline has flagged completion since the last
+/// `reset_actor_done`.
+pub fn is_actor_done() -> bool {
+    ACTOR_DONE.get()
+}
+
+/// Remove and return the outcome left by the trampoline, leaving `None`
+/// behind. Returns `None` if nothing has been recorded since the last take.
+pub fn take_last_outcome() -> Option<Outcome> {
+    LAST_OUTCOME.take()
+}
+
+/// Fixed entry point for every actor: its address is the `ret` target
+/// written onto each fresh actor stack, so the compiler must not inline it
+/// away.
+///
+/// Takes the pending closure out of `CURRENT_ACTOR_BOX`, runs it under
+/// `catch_unwind`, publishes the result through `LAST_OUTCOME` and
+/// `ACTOR_DONE`, then switches back to the scheduler. `extern "C-unwind"`
+/// permits unwinding to cross the boundary, but the `catch_unwind` here
+/// means a panic in the closure never actually does.
+///
+/// # Panics
+///
+/// Panics if no closure was installed before entry, or if control returns
+/// here after the final switch back to the scheduler.
+pub extern "C-unwind" fn trampoline() {
+    let entry = CURRENT_ACTOR_BOX
+        .take()
+        .expect("trampoline entered without a closure set");
+
+    // A caught panic becomes `Outcome::Panic`; a normal return is `Exit`.
+    let result = panic::catch_unwind(panic::AssertUnwindSafe(entry));
+    let outcome = result.map_or_else(Outcome::Panic, |()| Outcome::Exit);
+
+    LAST_OUTCOME.set(Some(outcome));
+    ACTOR_DONE.set(true);
+
+    // Hand control back. The scheduler will tear down our slot and never
+    // resume us again.
+    unsafe { switch_to_scheduler() };
+    // Unreachable. If it isn't, the scheduler has a bug.
+    unreachable!("scheduler resumed a done actor");
+}
+
+/// One actor's worth of state. Owned by the scheduler's slot table
+/// (`Slot::actor`).
+pub struct Actor {
+    /// The PID this actor was assigned at spawn time.
+    pub pid: Pid,
+    /// The stack the actor runs on. Dropped (munmap'd) when the actor dies.
+    pub stack: Stack,
+    /// The saved stack pointer. Initialised from `init_actor_stack` at
+    /// spawn and updated on every yield.
+    pub sp: usize,
+    /// The PID of this actor's supervisor. Used to deliver `Signal` on death.
+    pub supervisor: Pid,
+}
@@ -0,0 +1,163 @@
+//! Unbounded MPSC channels.
+//!
+//! Single-threaded scheduler: the inner state is `Rc<RefCell<Inner<T>>>`,
+//! not `Arc<Mutex>`. We hand-implement `Send` for `Sender<T>` and
+//! `Receiver<T>` when `T: Send`, on the basis that the only way two actor
+//! contexts touch the same channel is by being scheduled on the *same* OS
+//! thread (v0.1 has exactly one). When we add a second scheduler thread,
+//! this lie must be retired: replace `Rc<RefCell>` with `Arc<Mutex>` (or a
+//! lock-free queue) and remove the unsafe Send impls.
+//!
+//! Semantics:
+//!   - Senders are clonable; the last sender drop closes the channel.
+//!   - `Receiver::recv` on an empty open channel parks the receiver.
+//!   - `Receiver::recv` on an empty closed channel returns `Err(RecvError)`.
+//!   - `Sender::send` on an open channel always succeeds.
+//!   - `Sender::send` on a closed channel (receiver dropped) returns
+//!     `Err(SendError(value))`.
+//!   - When a send pushes to a previously empty queue and a receiver is
+//!     parked, the receiver is unparked.
+
+use crate::pid::Pid;
+use std::cell::RefCell;
+use std::collections::VecDeque;
+use std::rc::Rc;
+
+/// Create a fresh unbounded channel and return its `(Sender, Receiver)`
+/// pair. The channel starts open, empty, with one sender counted and no
+/// parked receiver.
+pub fn channel<T>() -> (Sender<T>, Receiver<T>) {
+    let state = Inner {
+        queue: VecDeque::new(),
+        parked_receiver: None,
+        senders: 1,
+        receiver_alive: true,
+    };
+    let shared = Rc::new(RefCell::new(state));
+    let tx = Sender { inner: Rc::clone(&shared) };
+    let rx = Receiver { inner: shared };
+    (tx, rx)
+}
+
+/// State shared by every endpoint of one channel.
+struct Inner<T> {
+    /// Messages sent but not yet received, oldest first.
+    queue: VecDeque<T>,
+    /// PID of the receiver if it is parked in `recv`; taken by whoever
+    /// wakes it (a send, or the last sender dropping).
+    parked_receiver: Option<Pid>,
+    /// Number of live `Sender` handles; zero means no more messages can
+    /// arrive.
+    senders: usize,
+    /// Cleared when the `Receiver` is dropped; sends fail afterwards.
+    receiver_alive: bool,
+}
+
+/// Sending half of a channel. Clonable; each clone is counted in
+/// `Inner::senders`.
+pub struct Sender<T> {
+    inner: Rc<RefCell<Inner<T>>>,
+}
+
+/// Receiving half of a channel. Not clonable: there is exactly one per
+/// channel.
+pub struct Receiver<T> {
+    inner: Rc<RefCell<Inner<T>>>,
+}
+
+// SAFETY (v0.1 only): `Rc<RefCell<_>>` is not thread-safe, so these impls
+// are sound only because the scheduler is single-threaded. Sender/Receiver
+// can be captured into actor closures (which require Send), but they will
+// only ever be touched from one OS thread. When multi-threading lands, swap
+// the `Rc<RefCell>` for `Arc<Mutex>` and remove these.
+unsafe impl<T: Send> Send for Sender<T> {}
+unsafe impl<T: Send> Send for Receiver<T> {}
+
+/// Returned by `Sender::send` when the receiver has been dropped. Carries
+/// the unsent value back to the caller.
+#[derive(Debug, PartialEq, Eq)]
+pub struct SendError<T>(pub T);
+
+/// Returned by `Receiver::recv` / `try_recv` when the queue is empty and
+/// every sender has been dropped.
+#[derive(Debug, PartialEq, Eq, Clone, Copy)]
+pub struct RecvError;
+
+impl std::fmt::Display for RecvError {
+    /// Writes the fixed human-readable message for a closed channel.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str("channel closed")
+    }
+}
+
+// Marker impl so `RecvError` composes with `?` and `Box<dyn Error>`.
+impl std::error::Error for RecvError {}
+
+impl<T> Clone for Sender<T> {
+    /// Produce another handle to the same channel and count it, so the
+    /// channel only closes once every clone has been dropped.
+    fn clone(&self) -> Self {
+        let inner = Rc::clone(&self.inner);
+        inner.borrow_mut().senders += 1;
+        Sender { inner }
+    }
+}
+
+impl<T> Drop for Sender<T> {
+    /// Un-count this sender. If it was the last one and nothing is queued,
+    /// wake a parked receiver so it can observe `RecvError`.
+    fn drop(&mut self) {
+        let mut state = self.inner.borrow_mut();
+        state.senders -= 1;
+        let closed_and_drained = state.senders == 0 && state.queue.is_empty();
+        let waiter = if closed_and_drained {
+            state.parked_receiver.take()
+        } else {
+            None
+        };
+        // Release the borrow before calling into the scheduler.
+        drop(state);
+
+        if let Some(pid) = waiter {
+            crate::scheduler::unpark(pid);
+        }
+    }
+}
+
+impl<T> Drop for Receiver<T> {
+    /// Mark the channel as receiver-less so later sends fail with
+    /// `SendError`. Already-queued messages are left in place.
+    fn drop(&mut self) {
+        let mut state = self.inner.borrow_mut();
+        state.receiver_alive = false;
+    }
+}
+
+impl<T> Sender<T> {
+    /// Enqueue `value` and wake the receiver if it is parked.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Err(SendError(value))`, handing the value back, if the
+    /// receiver has been dropped.
+    pub fn send(&self, value: T) -> Result<(), SendError<T>> {
+        // Re-enables allocator-driven preemption on scope exit, but only if
+        // it was enabled when the pause began.
+        struct ResumePreemption(bool);
+        impl Drop for ResumePreemption {
+            fn drop(&mut self) {
+                if self.0 {
+                    crate::preempt::PREEMPTION_ENABLED.with(|e| e.set(true));
+                }
+            }
+        }
+
+        let unpark = {
+            // `push_back` may allocate, and the allocator hook may switch to
+            // another actor. If that happened while `g` is live, the next
+            // actor to touch this channel would panic on `borrow_mut`. Pause
+            // preemption for as long as the borrow is held; `g` is declared
+            // after the guard, so it is released first.
+            let _pause = ResumePreemption(
+                crate::preempt::PREEMPTION_ENABLED.with(|e| e.replace(false)),
+            );
+            let mut g = self.inner.borrow_mut();
+            if !g.receiver_alive {
+                return Err(SendError(value));
+            }
+            g.queue.push_back(value);
+            // If the receiver is parked, unpark it.
+            g.parked_receiver.take()
+        };
+        // The borrow is released here, so the scheduler may be entered.
+        if let Some(pid) = unpark {
+            crate::scheduler::unpark(pid);
+        }
+        Ok(())
+    }
+}
+
+impl<T> Receiver<T> {
+    /// Take the next message, parking the calling actor while the channel
+    /// is empty but still open.
+    ///
+    /// # Errors
+    ///
+    /// Returns `Err(RecvError)` once the queue is empty and no senders
+    /// remain.
+    ///
+    /// # Panics
+    ///
+    /// Panics if it has to park and is not running inside an actor.
+    pub fn recv(&self) -> Result<T, RecvError> {
+        loop {
+            {
+                let mut state = self.inner.borrow_mut();
+                match state.queue.pop_front() {
+                    Some(msg) => return Ok(msg),
+                    None if state.senders == 0 => return Err(RecvError),
+                    None => {}
+                }
+                // Empty + open: register ourselves as the parked receiver.
+                let me = crate::actor::current_pid()
+                    .expect("recv() called outside an actor");
+                debug_assert!(
+                    state.parked_receiver.is_none(),
+                    "channel has more than one receiver"
+                );
+                state.parked_receiver = Some(me);
+            }
+            // The borrow ended with the scope above; whoever unparks us
+            // needs to borrow the channel itself.
+            crate::scheduler::park_current();
+            // Re-check from the top: we may have been woken by the last
+            // sender dropping rather than by a message.
+        }
+    }
+
+    /// Non-blocking variant of `recv`. `Ok(Some(v))` if a message was
+    /// available, `Ok(None)` if the channel is empty but open,
+    /// `Err(RecvError)` if closed and drained.
+    pub fn try_recv(&self) -> Result<Option<T>, RecvError> {
+        let mut state = self.inner.borrow_mut();
+        match state.queue.pop_front() {
+            Some(msg) => Ok(Some(msg)),
+            None if state.senders == 0 => Err(RecvError),
+            None => Ok(None),
+        }
+    }
+}
@@ -0,0 +1,106 @@
+//! Cooperative context switching, x86-64.
+//!
+//! Two naked-asm functions move execution between a scheduler thread and an
+//! actor running on its own mmap'd stack. The compiler cannot do this; the
+//! whole point of `#[unsafe(naked)]` is that we control every instruction.
+//!
+//! `SCHEDULER_SP` and `ACTOR_SP` are thread-locals holding each side's saved
+//! stack pointer. `init_actor_stack` builds the initial stack so that the
+//! first `switch_to_actor` lands inside the entry function with `rsp % 16 == 8`
+//! (the x86-64 ABI requirement at function entry).
+
+use std::cell::Cell;
+
+thread_local! {
+    /// Saved stack pointer of the scheduler side: stored by
+    /// `switch_to_actor_asm`, reloaded by `switch_to_scheduler`.
+    static SCHEDULER_SP: Cell<usize> = const { Cell::new(0) };
+    /// Saved stack pointer of the actor side: stored by
+    /// `switch_to_scheduler`, reloaded by `switch_to_actor_asm`.
+    static ACTOR_SP:     Cell<usize> = const { Cell::new(0) };
+}
+
+/// Read the scheduler's saved stack pointer. Called from the switch shims.
+fn get_scheduler_sp() -> usize {
+    SCHEDULER_SP.get()
+}
+
+/// Store the scheduler's stack pointer. Called from the switch shims.
+fn set_scheduler_sp(v: usize) {
+    SCHEDULER_SP.set(v);
+}
+
+/// Read the actor-side saved stack pointer.
+pub fn get_actor_sp() -> usize {
+    ACTOR_SP.get()
+}
+
+/// Store the actor-side stack pointer.
+pub fn set_actor_sp(v: usize) {
+    ACTOR_SP.set(v);
+}
+
+// ---------------------------------------------------------------------------
+// Initial stack layout
+//
+// `sp` starts at aligned_top - 8, where aligned_top = top & ~15. Below that
+// we write (downward) a return address and six callee-saved register slots.
+// The first `switch_to_actor` pops r15..rbx and `ret`s — landing in `entry`
+// with rsp == aligned_top - 8, i.e. rsp % 16 == 8.
+//
+// Layout (high → low), relative to aligned_top = top & ~15:
+//   aligned_top - 8  : (unused)        ← rsp after `ret`; rsp % 16 == 8.
+//   aligned_top - 16 : entry ptr       ← `ret` target.
+//   aligned_top - 24 : rbx = 0
+//   aligned_top - 32 : rbp = 0
+//   aligned_top - 40 : r12 = 0
+//   aligned_top - 48 : r13 = 0
+//   aligned_top - 56 : r14 = 0
+//   aligned_top - 64 : r15 = 0         ← initial rsp
+// ---------------------------------------------------------------------------
+
+/// Lay out the initial frame on a fresh actor stack and return the stack
+/// pointer to resume from.
+///
+/// Seven words are written below `(top & !15) - 8`: the `entry` address as
+/// the `ret` target, then zeroes for rbx, rbp, r12, r13, r14 and r15. The
+/// returned value points at the r15 slot, so the switch shim's six pops and
+/// `ret` enter `entry` with `rsp % 16 == 8`.
+///
+/// NOTE(review): this is a safe fn that writes through `top`; it assumes
+/// at least 64 writable bytes lie below `top` — confirm at the call site.
+pub fn init_actor_stack(top: *mut u8, entry: extern "C-unwind" fn()) -> usize {
+    // Highest address first: return address, then rbx, rbp, r12..r15.
+    let frame: [usize; 7] = [entry as usize, 0, 0, 0, 0, 0, 0];
+    let mut sp = (top as usize & !15) - 8;
+    for word in frame {
+        sp -= 8;
+        unsafe { (sp as *mut usize).write(word) };
+    }
+    sp
+}
+
+// ---------------------------------------------------------------------------
+// Context switch shims
+//
+// Each shim:
+//   1. Pushes the six callee-saved integer registers.
+//   2. Snaps rsp into rdi and calls the Rust helper that stores it.
+//   3. Calls the Rust helper that returns the *other* side's saved rsp.
+//   4. Moves that into rsp.
+//   5. Pops the six registers and rets.
+//
+// XMM registers are NOT saved here. We rely on every yield happening through
+// a Rust call site, which means the compiler has spilled any live XMM state
+// to the stack before we get here. (This is the same argument the compiler
+// uses internally — callee-saved regs are what survive a `call`, and the
+// SysV AMD64 ABI says XMM0–15 are all caller-saved.) If we ever yield from
+// a place that isn't a Rust call boundary, this assumption breaks.
+// ---------------------------------------------------------------------------
+
+/// Scheduler → actor switch shim.
+///
+/// Pushes the six callee-saved integer registers, records the resulting
+/// rsp via `set_scheduler_sp`, loads the actor's saved rsp from
+/// `get_actor_sp`, then pops the actor's registers and `ret`s into it.
+///
+/// NOTE(review): the two helpers are plain Rust-ABI functions called from
+/// assembly with the argument in rdi and the result in rax; declaring them
+/// `extern "C"` would make that contract explicit.
+#[unsafe(naked)]
+unsafe extern "C" fn switch_to_actor_asm() {
+    core::arch::naked_asm!(
+        "push rbx", "push rbp", "push r12", "push r13", "push r14", "push r15",
+        "mov rdi, rsp",
+        // On entry rsp % 16 == 8, and six pushes (48 bytes) keep it there.
+        // The SysV ABI requires rsp % 16 == 0 at a `call`, so pad by one
+        // word. The saved value is already in rdi, and rsp is overwritten
+        // below, so the padding never needs to be undone.
+        "sub rsp, 8",
+        "call {set_sched_sp}",
+        "call {get_actor_sp}",
+        "mov rsp, rax",
+        "pop r15", "pop r14", "pop r13", "pop r12", "pop rbp", "pop rbx",
+        "ret",
+        set_sched_sp = sym set_scheduler_sp,
+        get_actor_sp = sym get_actor_sp,
+    );
+}
+
+/// Resume the actor whose sp is in `ACTOR_SP`. Returns when the actor yields.
+///
+/// # Safety
+///
+/// The value in `ACTOR_SP` is loaded straight into rsp, so it must be a
+/// stack pointer produced by `init_actor_stack` or saved by a previous
+/// `switch_to_scheduler`, on a stack that is still mapped.
+pub unsafe fn switch_to_actor() {
+    unsafe { switch_to_actor_asm() };
+}
+
+/// Actor → scheduler switch shim.
+///
+/// Pushes the six callee-saved integer registers, records the resulting
+/// rsp via `set_actor_sp`, loads the scheduler's saved rsp from
+/// `get_scheduler_sp`, then pops the scheduler's registers and `ret`s into
+/// it.
+///
+/// # Safety
+///
+/// The scheduler's saved rsp is loaded unconditionally, so this must only
+/// be called from an actor that was entered through `switch_to_actor`.
+#[unsafe(naked)]
+pub unsafe extern "C" fn switch_to_scheduler() {
+    core::arch::naked_asm!(
+        "push rbx", "push rbp", "push r12", "push r13", "push r14", "push r15",
+        "mov rdi, rsp",
+        // On entry rsp % 16 == 8, and six pushes (48 bytes) keep it there.
+        // The SysV ABI requires rsp % 16 == 0 at a `call`, so pad by one
+        // word. The saved value is already in rdi, and rsp is overwritten
+        // below, so the padding never needs to be undone.
+        "sub rsp, 8",
+        "call {set_actor_sp}",
+        "call {get_sched_sp}",
+        "mov rsp, rax",
+        "pop r15", "pop r14", "pop r13", "pop r12", "pop rbp", "pop rbx",
+        "ret",
+        set_actor_sp = sym set_actor_sp,
+        get_sched_sp = sym get_scheduler_sp,
+    );
+}
@@ -0,0 +1,40 @@
+//! # smarm — Silly Marks Abstract Rust Machine
+//!
+//! Erlang-style green-thread actor concurrency for Rust.
+//!
+//! v0.1 is single-threaded. One scheduler, one OS thread. The scheduler
+//! cooperatively interleaves green-thread actors with hand-rolled context
+//! switches. Actors communicate by sending `Send` messages over channels;
+//! every actor has a supervisor, which is itself just an actor with a
+//! `Receiver<Signal>`.
+//!
+//! See `LOOM.md` for the design intent and the deferred-for-later list.
+
+pub mod stack;
+pub mod context;
+pub mod preempt;
+pub mod pid;
+pub mod actor;
+pub mod channel;
+pub mod scheduler;
+pub mod supervisor;
+
+// ---------------------------------------------------------------------------
+// Global allocator
+//
+// The preempting allocator wraps `System`. While `PREEMPTION_ENABLED` is
+// false (the default outside an actor) it adds one branch per allocation
+// and no syscalls. The scheduler flips it on per-resume.
+// ---------------------------------------------------------------------------
+
+// Registers the preempting allocator as the global allocator for any
+// binary that links this crate.
+#[global_allocator]
+static ALLOCATOR: preempt::PreemptingAllocator = preempt::PreemptingAllocator;
+
+// ---------------------------------------------------------------------------
+// Public API re-exports
+// ---------------------------------------------------------------------------
+
+pub use channel::{channel, Receiver, RecvError, Sender};
+pub use pid::Pid;
+pub use scheduler::{run, self_pid, spawn, spawn_under, yield_now, JoinError, JoinHandle};
+pub use supervisor::Signal;
@@ -0,0 +1,38 @@
+//! Process identifiers.
+//!
+//! A `Pid` is `(index, generation)`. The index is a slot in the scheduler's
+//! actor table; the generation increments every time that slot is reused.
+//! A stale `Pid` (correct index, wrong generation) is a detectable error,
+//! not a silent misdirection — solves the ABA problem without exhausting
+//! the PID space.
+
+/// Identifier of one actor incarnation: a slot index plus the generation
+/// that slot had when the PID was minted.
+#[derive(Copy, Clone, PartialEq, Eq, Hash)]
+pub struct Pid {
+    /// Slot position in the scheduler's actor table.
+    index: u32,
+    /// Reuse counter of that slot at mint time.
+    generation: u32,
+}
+
+impl Pid {
+    /// Build a PID from a slot index and that slot's generation.
+    #[inline]
+    pub const fn new(index: u32, generation: u32) -> Self {
+        Pid { index, generation }
+    }
+
+    /// Slot index this PID refers to.
+    #[inline]
+    pub const fn index(self) -> u32 {
+        self.index
+    }
+
+    /// Generation the slot had when this PID was created.
+    #[inline]
+    pub const fn generation(self) -> u32 {
+        self.generation
+    }
+}
+
+impl std::fmt::Debug for Pid {
+    /// Renders as `Pid(index.generation)`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Pid { index, generation } = *self;
+        f.write_fmt(format_args!("Pid({}.{})", index, generation))
+    }
+}
+
+impl std::fmt::Display for Pid {
+    /// Renders as `<index.generation>`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        let Pid { index, generation } = *self;
+        f.write_fmt(format_args!("<{}.{}>", index, generation))
+    }
+}
@@ -0,0 +1,104 @@
+//! Allocator-driven preemption.
+//!
+//! A `GlobalAlloc` wrapper counts allocations. Every `ALLOC_INTERVAL`-th
+//! allocation it reads RDTSC and, if the actor's timeslice has expired,
+//! calls `switch_to_scheduler` to yield.
+//!
+//! All state is thread-local. The scheduler enables preemption on resume
+//! and disables it on the return path, so the scheduler can never preempt
+//! itself.
+//!
+//! TSC frequency is machine-dependent; `TIMESLICE_CYCLES` is a constant
+//! calibrated for ~100µs on a 3 GHz CPU. A real implementation would
+//! measure it at startup. For v0.1 the constant suffices.
+
+use std::alloc::{GlobalAlloc, Layout, System};
+use std::cell::Cell;
+
+/// Value the allocation countdown is reset to; the RDTSC check runs when
+/// the countdown reaches zero.
+const ALLOC_INTERVAL: u32 = 128;
+/// Timeslice length in TSC cycles; an actor that has run longer than this
+/// is preempted at its next RDTSC check.
+const TIMESLICE_CYCLES: u64 = 300_000; // ≈ 100µs on a 3 GHz CPU
+
+thread_local! {
+    /// While `false`, the allocator hook never switches to the scheduler
+    /// (it still maintains the allocation countdown).
+    pub static PREEMPTION_ENABLED: Cell<bool> = const { Cell::new(false) };
+
+    /// Countdown to next RDTSC check. Reset to `ALLOC_INTERVAL` on resume
+    /// and whenever it reaches zero.
+    static ALLOC_COUNT: Cell<u32> = const { Cell::new(ALLOC_INTERVAL) };
+
+    /// RDTSC value written by the scheduler on every actor resume.
+    static TIMESLICE_START: Cell<u64> = const { Cell::new(0) };
+}
+
+/// Start a fresh timeslice: restart the allocation countdown and stamp the
+/// current TSC value as the slice start. Called by the scheduler on every
+/// resume.
+pub fn reset_timeslice() {
+    ALLOC_COUNT.set(ALLOC_INTERVAL);
+    TIMESLICE_START.set(rdtsc());
+}
+
+/// Read the CPU timestamp counter, preceded by an `lfence`.
+#[inline(always)]
+pub fn rdtsc() -> u64 {
+    unsafe {
+        // SAFETY: x86-64 only. `lfence` serialises the instruction stream so
+        // we don't measure time before prior instructions retire.
+        core::arch::asm!("lfence", options(nostack, nomem, preserves_flags));
+        core::arch::x86_64::_rdtsc()
+    }
+}
+
+/// Zero-sized `GlobalAlloc` wrapper around `System` that runs the
+/// preemption check before each allocating call.
+pub struct PreemptingAllocator;
+
+// Every method forwards to `System` unchanged. `alloc`, `alloc_zeroed` and
+// `realloc` run `maybe_preempt` first; `dealloc` does not, so freeing
+// memory is never a preemption point.
+unsafe impl GlobalAlloc for PreemptingAllocator {
+    #[inline]
+    unsafe fn alloc(&self, layout: Layout) -> *mut u8 {
+        maybe_preempt();
+        unsafe { System.alloc(layout) }
+    }
+
+    #[inline]
+    unsafe fn dealloc(&self, ptr: *mut u8, layout: Layout) {
+        unsafe { System.dealloc(ptr, layout) }
+    }
+
+    #[inline]
+    unsafe fn alloc_zeroed(&self, layout: Layout) -> *mut u8 {
+        maybe_preempt();
+        unsafe { System.alloc_zeroed(layout) }
+    }
+
+    #[inline]
+    unsafe fn realloc(&self, ptr: *mut u8, layout: Layout, new_size: usize) -> *mut u8 {
+        maybe_preempt();
+        unsafe { System.realloc(ptr, layout, new_size) }
+    }
+}
+
+/// Allocation hook: tick the countdown and, when it hits zero, yield to the
+/// scheduler if preemption is enabled and the timeslice has run out.
+#[inline(always)]
+fn maybe_preempt() {
+    let remaining = ALLOC_COUNT.get();
+    if remaining != 0 {
+        ALLOC_COUNT.set(remaining - 1);
+        return;
+    }
+
+    // Countdown expired: re-arm it before anything else, so it is already
+    // reset if we switch away below.
+    ALLOC_COUNT.set(ALLOC_INTERVAL);
+    if !PREEMPTION_ENABLED.get() {
+        return;
+    }
+
+    let slice_start = TIMESLICE_START.get();
+    let elapsed = rdtsc().saturating_sub(slice_start);
+    if elapsed > TIMESLICE_CYCLES {
+        // SAFETY: reachable only inside an actor (the scheduler sets
+        // PREEMPTION_ENABLED on resume and clears it on return). The
+        // scheduler stack is therefore valid.
+        unsafe { crate::context::switch_to_scheduler() };
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Test helpers
+// ---------------------------------------------------------------------------
+
+/// Test helper: zero both the slice start and the allocation countdown, so
+/// the very next allocation runs the RDTSC check and sees an expired slice.
+pub fn expire_timeslice_for_test() {
+    TIMESLICE_START.set(0);
+    ALLOC_COUNT.set(0);
+}
@@ -0,0 +1,529 @@
+//! The single-threaded scheduler.
+//!
+//! There is one global scheduler per OS thread, stored in a thread-local.
+//! `run(initial)` initialises it, spawns the initial actor, drives the loop
+//! until the run queue is empty, then tears it down.
+//!
+//! Slot table: a `Vec<Slot>` indexed by `Pid::index()`, with a free list of
+//! reusable indices. Each slot has a `generation` counter that increments
+//! every time the slot is freed; `Pid` carries the generation it was minted
+//! with, so a stale PID has a mismatching generation and is detected on
+//! lookup.
+//!
+//! Run queue: a `VecDeque<Pid>` of runnable actors. The state of an actor
+//! is implicit in slot.state: `Runnable` means it's either in the queue or
+//! currently executing; `Parked` means it's waiting for something to unpark
+//! it (channel send, join completion, …); `Done` means it has finished and
+//! is awaiting reaping.
+//!
+//! Joining: `JoinHandle::join()` parks the calling actor and registers it
+//! on the target slot's `waiters` list. When the target actor finishes,
+//! the scheduler reaps the slot and unparks every waiter, passing them the
+//! outcome via a side channel (the target's `outcome` field, drained on
+//! the joiner side).
+
+use crate::actor::{
+    clear_current_pid, current_pid, is_actor_done, reset_actor_done,
+    set_current_actor_box, set_current_pid, take_last_outcome, trampoline, Actor, Outcome,
+};
+use crate::channel::Sender;
+use crate::context::{get_actor_sp, init_actor_stack, set_actor_sp, switch_to_actor};
+use crate::pid::Pid;
+use crate::preempt::PREEMPTION_ENABLED;
+use crate::stack::Stack;
+use crate::supervisor::Signal;
+use std::cell::RefCell;
+use std::collections::VecDeque;
+
+// ---------------------------------------------------------------------------
+// Configuration
+// ---------------------------------------------------------------------------
+
+/// Size in bytes requested from `Stack::new` for every spawned actor.
+const ACTOR_STACK_SIZE: usize = 64 * 1024;
+
+// ---------------------------------------------------------------------------
+// Per-actor slot
+// ---------------------------------------------------------------------------
+
+/// Scheduling state of a slot. Free slots also carry `Done` (see
+/// `Slot::vacant` and `reclaim_slot`).
+enum State {
+    /// Either in the run queue or currently executing.
+    Runnable,
+    /// Removed from the queue, waiting for `unpark()`.
+    Parked,
+    /// The actor has finished. Slot persists until the last `JoinHandle`
+    /// has been joined (or dropped). Then the slot is freed.
+    Done,
+}
+
+/// One entry of the scheduler's actor table, indexed by `Pid::index()`.
+struct Slot {
+    /// Bumped every time this slot is freed and re-used. A `Pid` with a
+    /// non-matching generation is stale.
+    generation: u32,
+    /// `None` when the slot is free. `Some` otherwise.
+    actor: Option<Actor>,
+    /// Scheduling state; `Done` for finished actors and for free slots.
+    state: State,
+    /// PIDs waiting in `JoinHandle::join`.
+    waiters: Vec<Pid>,
+    /// The outcome the actor produced, captured when it finished.
+    /// Drained by `JoinHandle::join`.
+    outcome: Option<Outcome>,
+    /// If this slot is a supervisor, the sender into its `Signal` mailbox.
+    /// Cloned out and used when one of its children dies.
+    supervisor_channel: Option<Sender<Signal>>,
+    /// Number of `JoinHandle`s still outstanding for this actor. The slot
+    /// is reclaimed only when the actor is done AND outstanding_handles == 0.
+    outstanding_handles: u32,
+}
+
+impl Slot {
+    /// A never-used slot: generation 0, no actor, no waiters, no outcome,
+    /// no handles, and `State::Done` as the placeholder state.
+    fn vacant() -> Self {
+        Slot {
+            generation: 0,
+            state: State::Done,
+            actor: None,
+            outcome: None,
+            supervisor_channel: None,
+            waiters: Vec::new(),
+            outstanding_handles: 0,
+        }
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Scheduler state
+// ---------------------------------------------------------------------------
+
+/// Everything the scheduler tracks for one OS thread.
+struct SchedulerState {
+    /// Actor table, indexed by `Pid::index()`.
+    slots: Vec<Slot>,
+    /// Indices of reclaimed slots available for reuse.
+    free_list: Vec<u32>,
+    /// Runnable actors in the order they were queued.
+    run_queue: VecDeque<Pid>,
+    /// The root supervisor's PID. Children spawned at the top level are
+    /// supervised by this. Set by `run()`.
+    root_pid: Option<Pid>,
+}
+
+impl SchedulerState {
+    /// An empty scheduler: no slots, nothing runnable, no root supervisor.
+    fn new() -> Self {
+        SchedulerState {
+            slots: Vec::new(),
+            free_list: Vec::new(),
+            run_queue: VecDeque::new(),
+            root_pid: None,
+        }
+    }
+
+    /// Allocate a slot; return its (index, generation). A recycled index
+    /// from the free list is preferred and keeps its current generation;
+    /// otherwise the table grows by one vacant slot at generation 0.
+    fn allocate_slot(&mut self) -> (u32, u32) {
+        match self.free_list.pop() {
+            Some(recycled) => (recycled, self.slots[recycled as usize].generation),
+            None => {
+                let fresh = self.slots.len() as u32;
+                self.slots.push(Slot::vacant());
+                (fresh, 0)
+            }
+        }
+    }
+
+    /// Look up the slot for `pid`. `None` if the index is out of range or
+    /// the generation does not match (a stale PID).
+    fn slot(&self, pid: Pid) -> Option<&Slot> {
+        self.slots
+            .get(pid.index() as usize)
+            .filter(|found| found.generation == pid.generation())
+    }
+
+    /// Mutable counterpart of `slot`, with the same staleness check.
+    fn slot_mut(&mut self, pid: Pid) -> Option<&mut Slot> {
+        self.slots
+            .get_mut(pid.index() as usize)
+            .filter(|found| found.generation == pid.generation())
+    }
+}
+
+thread_local! {
+    /// This OS thread's scheduler; `None` while no scheduler is running.
+    /// Accessed only through `with_sched` / `try_with_sched`.
+    static SCHED: RefCell<Option<SchedulerState>> = const { RefCell::new(None) };
+}
+
+/// Guard that pauses allocator-driven preemption for its lifetime. On drop
+/// it re-enables preemption only if it was enabled when the pause began, so
+/// it is inert when used from the scheduler side.
+struct PreemptionPaused(bool);
+
+impl PreemptionPaused {
+    fn begin() -> Self {
+        Self(PREEMPTION_ENABLED.with(|e| e.replace(false)))
+    }
+}
+
+impl Drop for PreemptionPaused {
+    fn drop(&mut self) {
+        if self.0 {
+            PREEMPTION_ENABLED.with(|e| e.set(true));
+        }
+    }
+}
+
+/// Run `f` with exclusive access to this thread's scheduler state.
+///
+/// Preemption is paused while the `SCHED` borrow is held. `f` may allocate
+/// (e.g. by growing `slots`, `run_queue` or a `waiters` list), and a
+/// preemption at that point would switch stacks with the `RefCell` still
+/// mutably borrowed, so the next `with_sched` call would panic.
+///
+/// # Panics
+///
+/// Panics if no scheduler is running on this thread.
+fn with_sched<R>(f: impl FnOnce(&mut SchedulerState) -> R) -> R {
+    let _pause = PreemptionPaused::begin();
+    SCHED.with(|c| {
+        let mut g = c.borrow_mut();
+        let s = g.as_mut().expect("scheduler not running");
+        f(s)
+    })
+}
+
+/// Same as `with_sched` but returns `None` when there's no scheduler instead
+/// of panicking. Used on cleanup paths (channel sender drop during shutdown,
+/// for example). Preemption is paused for the duration of the borrow, for
+/// the same reason as in `with_sched`.
+fn try_with_sched<R>(f: impl FnOnce(&mut SchedulerState) -> R) -> Option<R> {
+    let _pause = PreemptionPaused::begin();
+    SCHED.with(|c| {
+        let mut g = c.borrow_mut();
+        g.as_mut().map(f)
+    })
+}
+
+// ---------------------------------------------------------------------------
+// JoinHandle
+// ---------------------------------------------------------------------------
+
+/// Returned by `JoinHandle::join` when the joined actor panicked.
+#[derive(Debug)]
+pub struct JoinError {
+    /// Whatever `panic!` was called with.
+    pub payload: Box<dyn std::any::Any + Send>,
+}
+
+/// Handle to a spawned actor, returned by `spawn` / `spawn_under`. Each
+/// handle is counted in its slot's `outstanding_handles`.
+pub struct JoinHandle {
+    /// PID of the actor this handle refers to.
+    pid: Pid,
+    /// `true` once `join()` has consumed the outcome and already
+    /// decremented the handle count. Prevents the Drop impl from
+    /// double-decrementing.
+    consumed: bool,
+}
+
+impl JoinHandle {
+    /// PID of the actor this handle refers to.
+    pub fn pid(&self) -> Pid {
+        self.pid
+    }
+
+    /// Block the calling actor until the target completes. Returns
+    /// `Ok(())` on normal exit, `Err(JoinError)` if the target panicked.
+    ///
+    /// # Panics
+    ///
+    /// Panics when called outside an actor, or if the target's slot no
+    /// longer matches this handle's PID.
+    pub fn join(mut self) -> Result<(), JoinError> {
+        let me = current_pid().expect("join() called outside an actor");
+
+        // Poll the slot; while the target is still alive, register as a
+        // waiter and park until woken, then look again.
+        let outcome = loop {
+            let finished = with_sched(|s| {
+                let slot = s.slot_mut(self.pid)
+                    .expect("join: target slot has been reused");
+                match slot.state {
+                    State::Done => {
+                        Some(slot.outcome.take().expect("Done slot must have an outcome"))
+                    }
+                    State::Runnable | State::Parked => {
+                        slot.waiters.push(me);
+                        None
+                    }
+                }
+            });
+            if let Some(o) = finished {
+                break o;
+            }
+            park_current();
+        };
+
+        // Mark consumed first so the Drop impl does not decrement again.
+        self.consumed = true;
+        self.decrement_handle_count();
+        match outcome {
+            Outcome::Exit => Ok(()),
+            Outcome::Panic(payload) => Err(JoinError { payload }),
+        }
+    }
+
+    /// Give up this handle's claim on the slot, reclaiming the slot if the
+    /// actor is done and this was the last handle. Does nothing if the PID
+    /// is stale.
+    fn decrement_handle_count(&mut self) {
+        let pid = self.pid;
+        with_sched(|s| {
+            let Some(slot) = s.slot_mut(pid) else {
+                return;
+            };
+            slot.outstanding_handles = slot.outstanding_handles.saturating_sub(1);
+            let reclaim = matches!(slot.state, State::Done) && slot.outstanding_handles == 0;
+            if reclaim {
+                reclaim_slot(s, pid);
+            }
+        });
+    }
+}
+
+impl Drop for JoinHandle {
+    /// Release this handle's count on the slot unless `join()` already did.
+    fn drop(&mut self) {
+        if self.consumed {
+            return;
+        }
+        self.decrement_handle_count();
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Slot reclamation
+// ---------------------------------------------------------------------------
+
+/// Return the slot at `pid.index()` to the free list: bump its generation,
+/// drop everything it owns, and reset its bookkeeping.
+///
+/// The slot is indexed directly, without a generation check; callers are
+/// expected to have validated `pid` first.
+///
+/// NOTE(review): this takes `&mut SchedulerState`, so the fields are
+/// dropped while the scheduler is borrowed. Dropping the last
+/// `Sender<Signal>` can call `unpark`, which borrows the scheduler again —
+/// confirm that cannot happen with a parked receiver.
+fn reclaim_slot(s: &mut SchedulerState, pid: Pid) {
+    let idx = pid.index();
+    let slot = &mut s.slots[idx as usize];
+    // Bump generation so any stale PIDs from now on miss.
+    slot.generation = slot.generation.wrapping_add(1);
+    // Drop the actor (its stack with it).
+    slot.actor = None;
+    slot.outcome = None;
+    slot.waiters.clear();
+    slot.supervisor_channel = None;
+    slot.state = State::Done; // semantically vacant; allocator checks free_list
+    slot.outstanding_handles = 0;
+    s.free_list.push(idx);
+}
+
+// ---------------------------------------------------------------------------
+// spawn / spawn_under / self_pid
+// ---------------------------------------------------------------------------
+
+/// Spawn `f` as a child of the currently-executing actor.
+///
+/// Outside an actor (only legal from `run()`'s initial setup), the child's
+/// supervisor is the root supervisor.
+///
+/// # Panics
+///
+/// Panics if called outside an actor while no scheduler is running or no
+/// root supervisor has been set.
+pub fn spawn(f: impl FnOnce() + Send + 'static) -> JoinHandle {
+    let parent = match current_pid() {
+        Some(pid) => pid,
+        None => with_sched(|s| s.root_pid).expect("spawn() before run()"),
+    };
+    spawn_under(parent, f)
+}
+
+/// Spawn `f` with `supervisor` as its parent. The supervisor will receive
+/// a `Signal` on its registered channel when the child terminates.
+///
+/// Allocates a slot and a stack, queues the new PID as runnable, stashes
+/// the closure for the scheduler to pick up before the first resume, and
+/// returns the single `JoinHandle` for the child.
+///
+/// Allocator-driven preemption is paused for the whole call. The child is
+/// queued before its closure is stashed, and boxing the closure allocates;
+/// a preemption in that window could let the scheduler resume the child
+/// with no closure to run.
+///
+/// # Panics
+///
+/// Panics if no scheduler is running or the stack cannot be allocated.
+pub fn spawn_under(supervisor: Pid, f: impl FnOnce() + Send + 'static) -> JoinHandle {
+    // Re-enables preemption on scope exit, but only if it was enabled when
+    // the pause began.
+    struct ResumePreemption(bool);
+    impl Drop for ResumePreemption {
+        fn drop(&mut self) {
+            if self.0 {
+                PREEMPTION_ENABLED.with(|e| e.set(true));
+            }
+        }
+    }
+    let _pause = ResumePreemption(PREEMPTION_ENABLED.with(|e| e.replace(false)));
+
+    let pid = with_sched(|s| {
+        let (idx, gen) = s.allocate_slot();
+        let pid = Pid::new(idx, gen);
+        let stack = Stack::new(ACTOR_STACK_SIZE)
+            .expect("stack allocation failed");
+        let sp = init_actor_stack(stack.top(), trampoline);
+        let slot = &mut s.slots[idx as usize];
+        slot.actor = Some(Actor { pid, stack, sp, supervisor });
+        slot.state = State::Runnable;
+        slot.outstanding_handles = 1;
+        slot.outcome = None;
+        slot.waiters.clear();
+        slot.supervisor_channel = None;
+        s.run_queue.push_back(pid);
+        pid
+    });
+
+    // Stash the closure where `schedule_loop` will find it before the first
+    // resume.
+    PENDING_CLOSURES.with(|c| {
+        c.borrow_mut().push((pid, Box::new(f) as Closure));
+    });
+
+    JoinHandle { pid, consumed: false }
+}
+
+/// Boxed actor body, as stored between `spawn_under` and the first resume.
+type Closure = Box<dyn FnOnce() + Send>;
+
+thread_local! {
+    /// Closures awaiting their first resume, stored unordered as
+    /// `(pid, closure)` pairs. Keyed by the PID the scheduler allocated for
+    /// them in `spawn_under`. The scheduler pops from here in
+    /// `pop_pending_closure` right before each first resume.
+    static PENDING_CLOSURES: RefCell<Vec<(Pid, Closure)>> = const { RefCell::new(Vec::new()) };
+}
+
+/// Remove and return the closure stashed for `pid`, or `None` if there is
+/// none. Uses `swap_remove`, so the order of the remaining entries is not
+/// preserved.
+fn pop_pending_closure(pid: Pid) -> Option<Closure> {
+    PENDING_CLOSURES.with_borrow_mut(|pending| {
+        let at = pending.iter().position(|(owner, _)| *owner == pid)?;
+        Some(pending.swap_remove(at).1)
+    })
+}
+
+/// PID of the actor that is currently executing.
+///
+/// # Panics
+/// Panics if no actor is current.
+pub fn self_pid() -> Pid {
+    let Some(me) = current_pid() else {
+        panic!("self_pid() called outside an actor");
+    };
+    me
+}
+
+// ---------------------------------------------------------------------------
+// yield_now / park / unpark
+// ---------------------------------------------------------------------------
+
+/// Voluntarily hand control back to the scheduler. The calling actor stays
+/// runnable and is re-queued at the tail of the run queue.
+pub fn yield_now() {
+    // Record what the scheduler should do with us once it regains control.
+    YIELD_INTENT.with(|intent| intent.set(YieldIntent::Yield));
+    // NOTE(review): presumably only sound when invoked on an actor stack
+    // that the scheduler switched into — confirm against `context`.
+    unsafe {
+        crate::context::switch_to_scheduler();
+    }
+}
+
+/// Suspend the calling actor: the scheduler marks it parked and leaves it
+/// off the run queue until some other party calls `unpark` with its pid.
+pub fn park_current() {
+    // Record what the scheduler should do with us once it regains control.
+    YIELD_INTENT.with(|intent| intent.set(YieldIntent::Park));
+    // NOTE(review): presumably only sound when invoked on an actor stack
+    // that the scheduler switched into — confirm against `context`.
+    unsafe {
+        crate::context::switch_to_scheduler();
+    }
+}
+
+/// Make a parked actor runnable again and append it to the run queue.
+///
+/// Deliberately a cheap no-op in every other situation:
+/// * the pid does not resolve to a slot,
+/// * the actor is not in the `Parked` state (already runnable, or done) —
+///   channel and join paths may both issue spurious unparks under some
+///   orderings,
+/// * the scheduler is not running at all (covers channel-sender drops
+///   during runtime teardown).
+pub fn unpark(pid: Pid) {
+    try_with_sched(|sched| {
+        let Some(slot) = sched.slot_mut(pid) else {
+            return;
+        };
+        if !matches!(slot.state, State::Parked) {
+            return;
+        }
+        slot.state = State::Runnable;
+        sched.run_queue.push_back(pid);
+    });
+}
+
+/// What an actor wants the scheduler to do when control returns from it.
+///
+/// Only consulted by `schedule_loop` when the actor has not finished; a
+/// finished actor is finalized regardless of the recorded intent.
+#[derive(Copy, Clone)]
+enum YieldIntent {
+    /// Re-queue (yield_now or preemption). The scheduler marks the slot
+    /// `Runnable` and pushes the pid to the back of the run queue.
+    Yield,
+    /// Remove from the run queue (waiting for unpark). The scheduler marks
+    /// the slot `Parked` and does not re-queue it.
+    Park,
+}
+
+thread_local! {
+    /// Intent recorded by the running actor just before it switches back to
+    /// the scheduler. `schedule_loop` resets it to `Yield` before every
+    /// resume, so `Yield` is what the scheduler sees unless `park_current`
+    /// overrides it.
+    static YIELD_INTENT: std::cell::Cell<YieldIntent> = const { std::cell::Cell::new(YieldIntent::Yield) };
+}
+
+// ---------------------------------------------------------------------------
+// Supervisor channel registration
+// ---------------------------------------------------------------------------
+
+/// Install `sender` as the mailbox that receives signals about children
+/// supervised by `pid`. Calling it again simply replaces the previous
+/// sender.
+///
+/// # Panics
+/// Panics if `pid` does not resolve to a slot.
+pub fn register_supervisor_channel(pid: Pid, sender: Sender<Signal>) {
+    with_sched(|sched| match sched.slot_mut(pid) {
+        Some(slot) => slot.supervisor_channel = Some(sender),
+        None => panic!("register_supervisor_channel: pid {:?} not found", pid),
+    });
+}
+
+// ---------------------------------------------------------------------------
+// run() — the runtime entry point
+// ---------------------------------------------------------------------------
+
+/// Boot the runtime, spawn `initial` as a child of the root supervisor,
+/// drive the scheduler until the run queue is empty, tear down.
+///
+/// The root supervisor is a *sentinel* PID, not a real actor. Signals
+/// addressed to it are dropped on the floor — that's what "process exits"
+/// means in the spec when nothing escalates further. User code that wants
+/// real supervision spawns its own supervisor actor and uses `spawn_under`.
+///
+/// The scheduler state lives in a thread-local, so the runtime is bound to
+/// the thread that calls `run`.
+///
+/// # Panics
+/// Panics with "smarm::run() called recursively" if scheduler state is
+/// already installed on this thread.
+pub fn run<F: FnOnce() + Send + 'static>(initial: F) {
+    // Install fresh scheduler state with the sentinel root pid so that the
+    // `spawn` below has a supervisor to fall back on.
+    SCHED.with(|c| {
+        assert!(c.borrow().is_none(), "smarm::run() called recursively");
+        let mut state = SchedulerState::new();
+        state.root_pid = Some(ROOT_PID);
+        *c.borrow_mut() = Some(state);
+    });
+
+    let initial_handle = spawn(initial);
+
+    // Returns once the run queue is empty.
+    schedule_loop();
+
+    // Drop the handle BEFORE the scheduler is torn down — its Drop impl
+    // calls `with_sched` to decrement the outstanding-handle count.
+    drop(initial_handle);
+
+    // Take the SchedulerState out of the thread-local BEFORE dropping it.
+    // Dropping it while still inside SCHED.with's RefCell borrow would
+    // re-enter (via channel senders' Drop → unpark → try_with_sched).
+    let state = SCHED.with(|c| c.borrow_mut().take());
+    drop(state);
+    // Discard closures of actors that were spawned but never resumed.
+    PENDING_CLOSURES.with(|c| c.borrow_mut().clear());
+}
+
+/// Reserved sentinel pid for the root supervisor (index and generation are
+/// both `u32::MAX`). Never allocated to a real actor; lookups return `None`;
+/// signals are dropped. `run()` installs it as the scheduler's root pid.
+pub const ROOT_PID: Pid = Pid::new(u32::MAX, u32::MAX);
+
+/// Core scheduling loop: repeatedly pop a pid from the front of the run
+/// queue, switch into that actor, and act on how it came back (finished,
+/// yielded, or parked). Returns when the run queue is empty.
+fn schedule_loop() {
+    loop {
+        let pid = match with_sched(|s| s.run_queue.pop_front()) {
+            Some(p) => p,
+            None => return,
+        };
+
+        // Look up sp; skip stale or already-reaped pids.
+        let sp = match with_sched(|s| {
+            s.slot(pid).and_then(|slot| slot.actor.as_ref().map(|a| a.sp))
+        }) {
+            Some(sp) => sp,
+            None => continue,
+        };
+
+        // If this is a first resume, move the pending closure to the
+        // thread-local the trampoline reads.
+        // (The lookup runs on every resume; it only finds an entry on the
+        // first one.)
+        if let Some(b) = pop_pending_closure(pid) {
+            set_current_actor_box(b);
+        }
+
+        // Publish per-resume state for the actor side, and default the
+        // intent to `Yield` so a return that never touched it re-queues.
+        set_actor_sp(sp);
+        set_current_pid(pid);
+        reset_actor_done();
+        YIELD_INTENT.with(|c| c.set(YieldIntent::Yield));
+
+        // Preemption is enabled only while the actor is running.
+        crate::preempt::reset_timeslice();
+        PREEMPTION_ENABLED.with(|c| c.set(true));
+
+        unsafe { switch_to_actor() };
+
+        // Back on the scheduler side.
+        PREEMPTION_ENABLED.with(|c| c.set(false));
+        clear_current_pid();
+
+        let intent = YIELD_INTENT.with(|c| c.get());
+        let new_sp = get_actor_sp();
+
+        if is_actor_done() {
+            // A missing outcome is treated as a normal exit.
+            let outcome = take_last_outcome().unwrap_or(Outcome::Exit);
+            finalize_actor(pid, outcome);
+        } else {
+            with_sched(|s| {
+                if let Some(slot) = s.slot_mut(pid) {
+                    // Remember where to resume the actor next time.
+                    if let Some(actor) = slot.actor.as_mut() {
+                        actor.sp = new_sp;
+                    }
+                    match intent {
+                        YieldIntent::Yield => {
+                            slot.state = State::Runnable;
+                            s.run_queue.push_back(pid);
+                        }
+                        YieldIntent::Park => {
+                            slot.state = State::Parked;
+                        }
+                    }
+                }
+            });
+        }
+    }
+}
+
+/// Retire the actor `pid` after it finished with `outcome`.
+///
+/// In order: store the outcome on the slot and mark it `Done` (dropping the
+/// `Actor`, and with it the stack), send a `Signal` to the supervisor's
+/// registered channel if there is one, unpark every waiter recorded on the
+/// slot, and finally reclaim the slot if no handles are outstanding.
+///
+/// # Panics
+/// Panics with "finalize_actor: slot vanished" if `pid` no longer resolves
+/// to a slot.
+fn finalize_actor(pid: Pid, outcome: Outcome) {
+    // Joiners get the typed Result with the panic payload. The supervisor
+    // gets an informational `Signal::Panic` with an empty payload — its job
+    // is policy (restart/escalate), not forensics. Users who need the
+    // payload in supervision can plumb their own channel.
+
+    let (joiner_outcome, sup_signal) = match outcome {
+        Outcome::Exit             => (Outcome::Exit, Signal::Exit(pid)),
+        Outcome::Panic(payload)   => (
+            Outcome::Panic(payload),
+            Signal::Panic(pid, Box::new(()) as Box<dyn std::any::Any + Send>),
+        ),
+    };
+
+    // Stash outcome, mark Done, collect waiters, drop the actor stack.
+    // The supervisor pid must be read before `slot.actor` is cleared.
+    let (waiters, supervisor_pid) = with_sched(|s| {
+        let slot = s.slot_mut(pid).expect("finalize_actor: slot vanished");
+        let sup = slot.actor.as_ref().map(|a| a.supervisor);
+        slot.outcome = Some(joiner_outcome);
+        slot.state = State::Done;
+        slot.actor = None;
+        let w = std::mem::take(&mut slot.waiters);
+        (w, sup)
+    });
+
+    // Deliver to supervisor (best-effort; ignore SendError).
+    // The sender is cloned out of the scheduler borrow first, so `send`
+    // runs without `with_sched` being active.
+    if let Some(sup) = supervisor_pid {
+        let sender = with_sched(|s| {
+            s.slot(sup).and_then(|slot| slot.supervisor_channel.clone())
+        });
+        if let Some(sender) = sender {
+            let _ = sender.send(sup_signal);
+        }
+    }
+
+    // Unpark joiners.
+    for joiner in waiters {
+        unpark(joiner);
+    }
+
+    // Reclaim if no outstanding handles.
+    with_sched(|s| {
+        let should_reclaim = match s.slot(pid) {
+            Some(slot) => slot.outstanding_handles == 0,
+            None => false,
+        };
+        if should_reclaim {
+            reclaim_slot(s, pid);
+        }
+    });
+}
@@ -0,0 +1,89 @@
+//! mmap-based fixed-size stack with a guard page below.
+//!
+//! Layout (low → high address):
+//!   [ guard page (PROT_NONE) | stack region ]
+//!                                            ^ top() — initial stack pointer
+//!
+//! Stacks grow downward. Overflow lands in the guard page → SIGSEGV.
+
+use std::io;
+
+/// An owned, mmap'd stack region with a guard page at its low end.
+///
+/// The mapping is created in `Stack::new` and released when the value is
+/// dropped.
+pub struct Stack {
+    /// Bottom of the entire mmap'd region (start of guard page). This is the
+    /// address returned by `mmap` and later handed to `munmap`.
+    base: *mut u8,
+    /// Total mmap'd size: guard_size + stack_size.
+    total_size: usize,
+    /// Usable stack size (excluding guard page), already rounded up to a
+    /// whole number of pages.
+    stack_size: usize,
+}
+
+// SAFETY: `Stack` is the sole owner of its mapping — `new` creates it with a
+// private anonymous `mmap` and `Drop` unmaps it — so moving the value to
+// another thread moves ownership of the memory along with it.
+unsafe impl Send for Stack {}
+
+impl Stack {
+    /// Allocate a new stack with at least `stack_size` usable bytes.
+    ///
+    /// The usable size is rounded up to a whole number of pages, and one
+    /// extra page is mapped below it and protected `PROT_NONE` as a guard.
+    ///
+    /// # Errors
+    /// * `InvalidInput` if `stack_size` is zero (the mapping would consist of
+    ///   the guard page alone) or so large that rounding it up and adding
+    ///   the guard page would overflow `usize`.
+    /// * The OS error if `mmap` or `mprotect` fails; in the `mprotect` case
+    ///   the fresh mapping is released before returning.
+    pub fn new(stack_size: usize) -> io::Result<Self> {
+        let page = page_size();
+        let guard_size = page;
+
+        if stack_size == 0 {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "stack size must be non-zero",
+            ));
+        }
+        // `round_up` and the guard-page addition use plain `+`, which wraps
+        // silently in release builds and would hand back a mapping far
+        // smaller than requested. Bound the request up front so neither
+        // step can overflow.
+        let max_request = usize::MAX - guard_size - (page - 1);
+        if stack_size > max_request {
+            return Err(io::Error::new(
+                io::ErrorKind::InvalidInput,
+                "stack size too large",
+            ));
+        }
+
+        let stack_size = round_up(stack_size, page);
+        let total_size = guard_size + stack_size;
+
+        // SAFETY: anonymous private mapping at a kernel-chosen address; no
+        // existing memory is touched.
+        let base = unsafe {
+            libc::mmap(
+                std::ptr::null_mut(),
+                total_size,
+                libc::PROT_READ | libc::PROT_WRITE,
+                libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
+                -1,
+                0,
+            )
+        };
+        if base == libc::MAP_FAILED {
+            return Err(io::Error::last_os_error());
+        }
+        let base = base as *mut u8;
+
+        // Turn the lowest page into the guard.
+        // SAFETY: `[base, base + guard_size)` lies inside the mapping that
+        // was just created.
+        let ret = unsafe {
+            libc::mprotect(base as *mut libc::c_void, guard_size, libc::PROT_NONE)
+        };
+        if ret != 0 {
+            // Capture errno before `munmap` can overwrite it.
+            let err = io::Error::last_os_error();
+            // SAFETY: unmapping exactly the region mapped above.
+            unsafe { libc::munmap(base as *mut libc::c_void, total_size) };
+            return Err(err);
+        }
+
+        Ok(Self { base, total_size, stack_size })
+    }
+
+    /// 16-byte-aligned top of the usable region.
+    pub fn top(&self) -> *mut u8 {
+        let raw_top = self.base as usize + self.total_size;
+        (raw_top & !15) as *mut u8
+    }
+
+    /// Pointer to the bottom of the usable region (just above the guard page).
+    pub fn usable_base(&self) -> *mut u8 {
+        // The guard is everything below the usable region, so its size is
+        // the difference of the two stored sizes; no need to query the page
+        // size again.
+        let guard_size = self.total_size - self.stack_size;
+        // SAFETY: `guard_size < total_size`, so the result stays inside the
+        // mapping created in `new`.
+        unsafe { self.base.add(guard_size) }
+    }
+
+    /// Usable stack size in bytes (page-rounded, excluding the guard page).
+    pub fn stack_size(&self) -> usize {
+        self.stack_size
+    }
+}
+
+impl Drop for Stack {
+    /// Release the whole mapping (guard page plus usable region). The
+    /// return value of `munmap` is ignored.
+    fn drop(&mut self) {
+        let region = self.base as *mut libc::c_void;
+        let len = self.total_size;
+        // SAFETY: `region`/`len` describe exactly the mapping created in
+        // `Stack::new`, which this value owns.
+        unsafe {
+            libc::munmap(region, len);
+        }
+    }
+}
+
+/// System page size in bytes, as reported by `sysconf(_SC_PAGESIZE)`.
+fn page_size() -> usize {
+    // SAFETY: `sysconf` only reads a system configuration value.
+    let raw = unsafe { libc::sysconf(libc::_SC_PAGESIZE) };
+    raw as usize
+}
+
+/// Round `n` up to the next multiple of `align`.
+///
+/// The mask arithmetic is only meaningful when `align` is a power of two.
+fn round_up(n: usize, align: usize) -> usize {
+    let bumped = n + align - 1;
+    let low_bits = align - 1;
+    bumped & !low_bits
+}
@@ -0,0 +1,37 @@
+//! Supervision signals.
+//!
+//! Every actor has a supervisor, which is itself just an actor with a
+//! `Receiver<Signal>`. When a child actor terminates, the scheduler sends
+//! a `Signal` on the supervisor's channel. The supervisor decides what to
+//! do — restart, escalate, ignore.
+//!
+//! For v0.1 there is no built-in restart-intensity cap. That's policy and
+//! lives in user code; library is mechanism only.
+
+use crate::pid::Pid;
+use std::any::Any;
+
+/// Notification about a child's termination; the scheduler sends it on the
+/// supervisor's registered channel.
+pub enum Signal {
+    /// The child exited normally.
+    Exit(Pid),
+    /// The child panicked. Signals built by the scheduler's `finalize_actor`
+    /// carry an empty `()` placeholder here, not the value `panic!` was
+    /// called with; the real payload is kept for joiners.
+    Panic(Pid, Box<dyn Any + Send>),
+}
+
+impl std::fmt::Debug for Signal {
+    /// Prints the variant name and pid. The panic payload is opaque
+    /// (`dyn Any`), so it is elided as `..`.
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match *self {
+            Signal::Panic(ref who, _) => {
+                f.write_fmt(format_args!("Signal::Panic({:?}, ..)", who))
+            }
+            Signal::Exit(ref who) => {
+                f.write_fmt(format_args!("Signal::Exit({:?})", who))
+            }
+        }
+    }
+}
+
+impl Signal {
+    /// Pid of the child this signal is about, whichever the variant.
+    pub fn pid(&self) -> Pid {
+        let (Signal::Exit(child) | Signal::Panic(child, _)) = self;
+        *child
+    }
+}