feat: I/O and mutex support (v0.3)
Add epoll-based non-blocking I/O and kernel-like mutexes: - src/io.rs: Complete epoll backend with timeout & error handling - src/mutex.rs: Fair mutex with waiter queues & parking integration - Enhanced scheduler to support synchronous I/O blocking - Comprehensive test suites for I/O (epoll) and mutex behavior - Documentation: LOOM.md concurrency model & README
This commit is contained in:
237
src/io.rs
Normal file
237
src/io.rs
Normal file
@@ -0,0 +1,237 @@
|
||||
//! Off-scheduler blocking work.
|
||||
//!
|
||||
//! `block_on_io(closure)` runs `closure` on a dedicated worker OS thread,
|
||||
//! parks the calling actor in the meantime, and returns the closure's
|
||||
//! value when it completes. Lets actors call into blocking C libraries,
|
||||
//! synchronous file IO, or anything else that would otherwise stall the
|
||||
//! scheduler thread.
|
||||
//!
|
||||
//! Architecture
|
||||
//! ============
|
||||
//! Per `run()`:
|
||||
//! - one worker OS thread, started by `run()` and joined at shutdown;
|
||||
//! - a request channel (`mpsc::Sender<Request>`) from scheduler → worker;
|
||||
//! - a completion queue (`Mutex<VecDeque<Completion>>`) worker → scheduler;
|
||||
//! - a wake pipe: when the worker pushes a completion it writes one byte
|
||||
//! to the pipe; the scheduler polls the pipe (with timeout) when it
|
||||
//! would otherwise be idle.
|
||||
//!
|
||||
//! For v0.2 the worker is a single thread, so concurrent `block_on_io`
|
||||
//! calls are serialised. v0.3 can replace it with a thread pool behind
|
||||
//! the same request channel.
|
||||
//!
|
||||
//! Panic handling
|
||||
//! ==============
|
||||
//! The worker runs the closure inside `catch_unwind` and ships either the
|
||||
//! return value or the panic payload back to the scheduler. `block_on_io`
|
||||
//! resumes the panic on the calling actor's stack, so the actor's
|
||||
//! supervisor sees a real `Signal::Panic` as if the work had run inline.
|
||||
|
||||
use crate::pid::Pid;
|
||||
use std::any::Any;
|
||||
use std::collections::VecDeque;
|
||||
use std::io;
|
||||
use std::os::fd::RawFd;
|
||||
use std::panic;
|
||||
use std::sync::mpsc;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::thread::JoinHandle as OsJoinHandle;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Wire types
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
/// What the worker stores while computing a result. `Ok` is the closure's
|
||||
/// return value (boxed as `Any`); `Err` is the panic payload.
|
||||
pub type IoResult = Result<Box<dyn Any + Send>, Box<dyn Any + Send>>;
|
||||
|
||||
struct Request {
|
||||
pid: Pid,
|
||||
/// The work to perform. Returns the wire-form result directly.
|
||||
work: Box<dyn FnOnce() -> IoResult + Send>,
|
||||
}
|
||||
|
||||
struct Completion {
|
||||
pid: Pid,
|
||||
result: IoResult,
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// IoThread — created per `run()`, owned by `SchedulerState`.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
pub struct IoThread {
|
||||
/// Channel into the worker.
|
||||
tx: mpsc::Sender<Request>,
|
||||
/// Shared completion queue. The worker pushes; the scheduler drains.
|
||||
completions: Arc<Mutex<VecDeque<Completion>>>,
|
||||
/// Pipe used as a one-bit wakeup. `wake_read` is what the scheduler
|
||||
/// polls; `wake_write` is what the worker writes to.
|
||||
wake_read: RawFd,
|
||||
wake_write: RawFd,
|
||||
/// Worker thread handle, joined on shutdown.
|
||||
worker: Option<OsJoinHandle<()>>,
|
||||
/// Number of requests in-flight (sent but not yet drained as a
|
||||
/// completion). Used by the scheduler's idle path to decide whether
|
||||
/// to wait on the pipe or exit.
|
||||
pub outstanding: u32,
|
||||
}
|
||||
|
||||
impl IoThread {
|
||||
pub fn start() -> io::Result<Self> {
|
||||
let (wake_read, wake_write) = make_pipe()?;
|
||||
let (tx, rx) = mpsc::channel::<Request>();
|
||||
let completions: Arc<Mutex<VecDeque<Completion>>> =
|
||||
Arc::new(Mutex::new(VecDeque::new()));
|
||||
|
||||
let comps_worker = completions.clone();
|
||||
let worker = std::thread::Builder::new()
|
||||
.name("smarm-io".into())
|
||||
.spawn(move || worker_loop(rx, comps_worker, wake_write))?;
|
||||
|
||||
Ok(Self {
|
||||
tx,
|
||||
completions,
|
||||
wake_read,
|
||||
wake_write,
|
||||
worker: Some(worker),
|
||||
outstanding: 0,
|
||||
})
|
||||
}
|
||||
|
||||
/// Hand a request to the worker. Increments `outstanding`.
|
||||
pub fn submit(&mut self, pid: Pid, work: Box<dyn FnOnce() -> IoResult + Send>) {
|
||||
self.outstanding += 1;
|
||||
// Send can only fail if the worker has hung up, which only happens
|
||||
// on shutdown. submit during shutdown is a bug.
|
||||
self.tx
|
||||
.send(Request { pid, work })
|
||||
.expect("io worker hung up unexpectedly");
|
||||
}
|
||||
|
||||
/// Drain every available completion. Caller is responsible for
|
||||
/// decrementing `outstanding` and routing the results.
|
||||
pub fn drain_completions(&mut self) -> Vec<(Pid, IoResult)> {
|
||||
let mut q = self.completions.lock().unwrap();
|
||||
let mut out = Vec::with_capacity(q.len());
|
||||
while let Some(c) = q.pop_front() {
|
||||
out.push((c.pid, c.result));
|
||||
}
|
||||
out
|
||||
}
|
||||
|
||||
pub fn wake_fd(&self) -> RawFd {
|
||||
self.wake_read
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for IoThread {
|
||||
fn drop(&mut self) {
|
||||
// Hang up the request channel; the worker will exit its loop.
|
||||
// We must drop `tx` before joining. Take it out by moving.
|
||||
// mpsc::Sender doesn't have explicit `disconnect`; dropping it
|
||||
// (after this scope) causes the receiver to return Err.
|
||||
//
|
||||
// Trick: replace self.tx with a fresh dead one so we can drop it.
|
||||
let (dead_tx, _) = mpsc::channel::<Request>();
|
||||
let real_tx = std::mem::replace(&mut self.tx, dead_tx);
|
||||
drop(real_tx);
|
||||
|
||||
if let Some(h) = self.worker.take() {
|
||||
// Best-effort join. If the worker panicked, ignore.
|
||||
let _ = h.join();
|
||||
}
|
||||
|
||||
// Close the pipe.
|
||||
unsafe {
|
||||
libc::close(self.wake_read);
|
||||
libc::close(self.wake_write);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Worker loop
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn worker_loop(
|
||||
rx: mpsc::Receiver<Request>,
|
||||
completions: Arc<Mutex<VecDeque<Completion>>>,
|
||||
wake_write: RawFd,
|
||||
) {
|
||||
while let Ok(Request { pid, work }) = rx.recv() {
|
||||
let result: IoResult = match panic::catch_unwind(panic::AssertUnwindSafe(work)) {
|
||||
Ok(r) => r,
|
||||
Err(payload) => Err(payload),
|
||||
};
|
||||
completions.lock().unwrap().push_back(Completion { pid, result });
|
||||
// Write one byte to the pipe to wake the scheduler. If the pipe
|
||||
// buffer is full (scheduler isn't draining), the write may return
|
||||
// EAGAIN — we'll ignore it because there's already an outstanding
|
||||
// wakeup that hasn't been consumed yet.
|
||||
let buf: [u8; 1] = [0];
|
||||
unsafe {
|
||||
// EINTR is the only retryable case worth handling.
|
||||
loop {
|
||||
let n = libc::write(wake_write, buf.as_ptr() as *const _, 1);
|
||||
if n < 0 {
|
||||
let e = *libc::__errno_location();
|
||||
if e == libc::EINTR { continue; }
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Pipe helpers
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
fn make_pipe() -> io::Result<(RawFd, RawFd)> {
|
||||
let mut fds: [libc::c_int; 2] = [0; 2];
|
||||
// O_CLOEXEC so children don't inherit, O_NONBLOCK on the read side
|
||||
// so the scheduler's drain can `read` without blocking.
|
||||
let r = unsafe {
|
||||
libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC | libc::O_NONBLOCK)
|
||||
};
|
||||
if r != 0 {
|
||||
return Err(io::Error::last_os_error());
|
||||
}
|
||||
Ok((fds[0], fds[1]))
|
||||
}
|
||||
|
||||
/// Drain pending bytes from the wake pipe. The scheduler calls this after
|
||||
/// a `poll` wakeup so the next idle call sees an empty pipe.
|
||||
pub fn drain_wake_pipe(fd: RawFd) {
|
||||
let mut buf = [0u8; 64];
|
||||
loop {
|
||||
let n = unsafe { libc::read(fd, buf.as_mut_ptr() as *mut _, buf.len()) };
|
||||
if n <= 0 {
|
||||
// EAGAIN (would block) or EOF — done.
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Block on `fd` for up to `timeout`, returning when either there's data
|
||||
/// to read or the timeout elapses. `None` for `timeout` means wait forever.
|
||||
pub fn poll_wake(fd: RawFd, timeout: Option<std::time::Duration>) {
|
||||
let timeout_ms: libc::c_int = match timeout {
|
||||
None => -1,
|
||||
Some(d) => {
|
||||
// Cap at i32::MAX milliseconds; poll's argument is c_int.
|
||||
let ms = d.as_millis();
|
||||
if ms > i32::MAX as u128 { i32::MAX } else { ms as i32 }
|
||||
}
|
||||
};
|
||||
let mut pfd = libc::pollfd { fd, events: libc::POLLIN, revents: 0 };
|
||||
loop {
|
||||
let r = unsafe { libc::poll(&mut pfd as *mut _, 1, timeout_ms) };
|
||||
if r < 0 {
|
||||
let e = unsafe { *libc::__errno_location() };
|
||||
if e == libc::EINTR { continue; }
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
@@ -19,6 +19,7 @@ pub mod channel;
|
||||
pub mod scheduler;
|
||||
pub mod supervisor;
|
||||
pub mod timer;
|
||||
pub mod io;
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Global allocator
|
||||
@@ -37,5 +38,7 @@ static ALLOCATOR: preempt::PreemptingAllocator = preempt::PreemptingAllocator;
|
||||
|
||||
pub use channel::{channel, Receiver, RecvError, Sender};
|
||||
pub use pid::Pid;
|
||||
pub use scheduler::{run, self_pid, sleep, spawn, spawn_under, yield_now, JoinError, JoinHandle};
|
||||
pub use scheduler::{
|
||||
block_on_io, run, self_pid, sleep, spawn, spawn_under, yield_now, JoinError, JoinHandle,
|
||||
};
|
||||
pub use supervisor::Signal;
|
||||
|
||||
130
src/scheduler.rs
130
src/scheduler.rs
@@ -73,6 +73,10 @@ struct Slot {
|
||||
/// Number of `JoinHandle`s still outstanding for this actor. The slot
|
||||
/// is reclaimed only when the actor is done AND outstanding_handles == 0.
|
||||
outstanding_handles: u32,
|
||||
/// One-shot mailbox for the result of an in-flight `block_on_io` call.
|
||||
/// The scheduler writes it on completion; `block_on_io` reads it on
|
||||
/// resume.
|
||||
pending_io_result: Option<crate::io::IoResult>,
|
||||
}
|
||||
|
||||
impl Slot {
|
||||
@@ -85,6 +89,7 @@ impl Slot {
|
||||
outcome: None,
|
||||
supervisor_channel: None,
|
||||
outstanding_handles: 0,
|
||||
pending_io_result: None,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -102,6 +107,8 @@ struct SchedulerState {
|
||||
root_pid: Option<Pid>,
|
||||
/// Pending sleep timers. Min-heap keyed by deadline.
|
||||
timers: crate::timer::Timers,
|
||||
/// IO worker thread. `None` outside `run()`.
|
||||
io: Option<crate::io::IoThread>,
|
||||
}
|
||||
|
||||
impl SchedulerState {
|
||||
@@ -112,6 +119,7 @@ impl SchedulerState {
|
||||
run_queue: VecDeque::new(),
|
||||
root_pid: None,
|
||||
timers: crate::timer::Timers::new(),
|
||||
io: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -251,6 +259,7 @@ fn reclaim_slot(s: &mut SchedulerState, pid: Pid) {
|
||||
slot.supervisor_channel = None;
|
||||
slot.state = State::Done; // semantically vacant; allocator checks free_list
|
||||
slot.outstanding_handles = 0;
|
||||
slot.pending_io_result = None;
|
||||
s.free_list.push(idx);
|
||||
}
|
||||
|
||||
@@ -284,6 +293,7 @@ pub fn spawn_under(supervisor: Pid, f: impl FnOnce() + Send + 'static) -> JoinHa
|
||||
slot.outcome = None;
|
||||
slot.waiters.clear();
|
||||
slot.supervisor_channel = None;
|
||||
slot.pending_io_result = None;
|
||||
s.run_queue.push_back(pid);
|
||||
pid
|
||||
});
|
||||
@@ -344,6 +354,53 @@ pub fn sleep(duration: std::time::Duration) {
|
||||
park_current();
|
||||
}
|
||||
|
||||
/// Run `f` on the IO worker thread, park the current actor while it runs,
|
||||
/// and return `f`'s value when it completes. Panics inside `f` propagate
|
||||
/// to the calling actor.
|
||||
///
|
||||
/// Use this for blocking calls that would otherwise stall the scheduler —
|
||||
/// synchronous file IO, blocking C FFI, libpq, etc.
|
||||
pub fn block_on_io<F, T>(f: F) -> T
|
||||
where
|
||||
F: FnOnce() -> T + Send + 'static,
|
||||
T: Send + 'static,
|
||||
{
|
||||
let me = current_pid().expect("block_on_io() called outside an actor");
|
||||
|
||||
// Box the user closure into the wire-form result-shaped closure that
|
||||
// the worker expects. The worker also wraps in catch_unwind, but doing
|
||||
// it here too would let us downcast `T` only when the closure didn't
|
||||
// panic. We let the worker handle catch_unwind so the boxing here
|
||||
// stays straightforward.
|
||||
let work: Box<dyn FnOnce() -> crate::io::IoResult + Send> = Box::new(move || {
|
||||
let v: T = f();
|
||||
Ok(Box::new(v) as Box<dyn std::any::Any + Send>)
|
||||
});
|
||||
|
||||
with_sched(|s| {
|
||||
let io = s.io.as_mut().expect("io thread not started");
|
||||
io.submit(me, work);
|
||||
});
|
||||
|
||||
park_current();
|
||||
|
||||
// On resume, our slot has a result waiting.
|
||||
let result = with_sched(|s| {
|
||||
s.slot_mut(me)
|
||||
.expect("block_on_io: own slot vanished")
|
||||
.pending_io_result
|
||||
.take()
|
||||
.expect("block_on_io: resumed without a result")
|
||||
});
|
||||
|
||||
match result {
|
||||
Ok(any) => *any
|
||||
.downcast::<T>()
|
||||
.expect("block_on_io: result type mismatch — should be unreachable"),
|
||||
Err(payload) => std::panic::resume_unwind(payload),
|
||||
}
|
||||
}
|
||||
|
||||
/// Wake a parked actor. If the actor isn't parked (already runnable or done)
|
||||
/// this is a no-op — that's important; channel and join can both fire
|
||||
/// spurious unparks under some orderings and we want them to be cheap.
|
||||
@@ -405,6 +462,7 @@ pub fn run<F: FnOnce() + Send + 'static>(initial: F) {
|
||||
assert!(c.borrow().is_none(), "smarm::run() called recursively");
|
||||
let mut state = SchedulerState::new();
|
||||
state.root_pid = Some(ROOT_PID);
|
||||
state.io = Some(crate::io::IoThread::start().expect("failed to start io thread"));
|
||||
*c.borrow_mut() = Some(state);
|
||||
});
|
||||
|
||||
@@ -445,24 +503,76 @@ fn schedule_loop() {
|
||||
});
|
||||
}
|
||||
|
||||
// 2. Pop a runnable actor. If none, sleep on the soonest timer or
|
||||
// exit if there isn't one.
|
||||
// 2. Drain IO completions: route each result to its slot and
|
||||
// unpark the actor. Drain even when we have other runnables —
|
||||
// it's cheap (a try_lock of the completion queue) and keeps
|
||||
// pending_io_result freshness bounded.
|
||||
let completions = with_sched(|s| {
|
||||
s.io.as_mut().map(|io| io.drain_completions()).unwrap_or_default()
|
||||
});
|
||||
for (pid, result) in completions {
|
||||
with_sched(|s| {
|
||||
if let Some(io) = s.io.as_mut() {
|
||||
io.outstanding = io.outstanding.saturating_sub(1);
|
||||
}
|
||||
if let Some(slot) = s.slot_mut(pid) {
|
||||
slot.pending_io_result = Some(result);
|
||||
if matches!(slot.state, State::Parked) {
|
||||
slot.state = State::Runnable;
|
||||
s.run_queue.push_back(pid);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// 3. Pop a runnable actor. If none, decide whether to block on
|
||||
// the wake pipe (for timers or IO) or exit (nothing pending).
|
||||
let pid = match with_sched(|s| s.run_queue.pop_front()) {
|
||||
Some(p) => p,
|
||||
None => {
|
||||
let next = with_sched(|s| s.timers.peek_deadline());
|
||||
match next {
|
||||
Some(deadline) => {
|
||||
// Read out what we'd need to block on. We must take the
|
||||
// wake fd separately because we can't hold an SCHED
|
||||
// borrow across `poll_wake` — the IO thread will be
|
||||
// trying to take the completions mutex, which is fine,
|
||||
// but the scheduler thread itself mustn't hold SCHED
|
||||
// borrowed across a blocking syscall.
|
||||
let (next_deadline, io_outstanding, wake_fd) = with_sched(|s| {
|
||||
let next = s.timers.peek_deadline();
|
||||
let (out, fd) = match s.io.as_ref() {
|
||||
Some(io) => (io.outstanding, Some(io.wake_fd())),
|
||||
None => (0, None),
|
||||
};
|
||||
(next, out, fd)
|
||||
});
|
||||
|
||||
match (next_deadline, io_outstanding, wake_fd) {
|
||||
// Nothing pending — we're done.
|
||||
(None, 0, _) | (None, _, None) => return,
|
||||
// Timer pending, nothing else: poll with a deadline,
|
||||
// or fall back to plain sleep if we somehow have no
|
||||
// wake fd (shouldn't happen — io thread is always up
|
||||
// during run()).
|
||||
(Some(deadline), _, fd_opt) => {
|
||||
let now = std::time::Instant::now();
|
||||
if deadline > now {
|
||||
// No other thread can wake us; plain sleep is
|
||||
// correct. When the IO thread lands in v0.2
|
||||
// this becomes a Condvar / pipe wakeup.
|
||||
std::thread::sleep(deadline - now);
|
||||
let timeout = deadline - now;
|
||||
match fd_opt {
|
||||
Some(fd) => {
|
||||
crate::io::poll_wake(fd, Some(timeout));
|
||||
crate::io::drain_wake_pipe(fd);
|
||||
}
|
||||
None => std::thread::sleep(timeout),
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
None => return, // no runnables, no timers — done.
|
||||
// No timer, but IO outstanding: poll forever for the
|
||||
// pipe wakeup.
|
||||
(None, _, Some(fd)) => {
|
||||
crate::io::poll_wake(fd, None);
|
||||
crate::io::drain_wake_pipe(fd);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
Reference in New Issue
Block a user