fix: stress testing & stability (v0.6.5)

Improve reliability under high load:
- tests/stress.rs: New comprehensive stress test suite (448 lines)
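- Fine-tune I/O & runtime scheduling edge cases (defer an unpark that arrives before the actor has parked; exit the scheduler only when queue, live actors, timers and I/O are all clear; keep the panic hook silent inside actors)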
- Pin versions & fix MSRV compatibility
This commit is contained in:
smarm
2026-05-24 07:03:45 +00:00
parent 978678a46e
commit aeacaf6118
4 changed files with 523 additions and 30 deletions

View File

@@ -427,9 +427,10 @@ fn epoll_loop(
continue;
}
let fd = ev.u64 as RawFd;
let evs = ev.events;
q.push_back(Completion::FdReady {
fd,
events: ev.events,
events: evs,
});
pushed_any = true;
}

View File

@@ -331,6 +331,34 @@ impl Runtime {
/// Run `f` as the initial actor, block until all actors finish.
/// Can be called multiple times sequentially on the same `Runtime`.
pub fn run(&self, f: impl FnOnce() + Send + 'static) {
// Install smarm's panic hook on first call. The default Rust hook is
// not reentrant — concurrent actor panics can trigger a double-panic
// abort when the backtrace printer takes an internal lock that is
// already held. smarm catches every actor panic via `catch_unwind` in
// the trampoline, so panics never need to reach the hook for runtime
// correctness; the hook still fires because Rust invokes it when the
// panic is raised, before unwinding reaches `catch_unwind`.
//
// We install once and leave it installed: the previous hook is chained
// so that panics outside actor context (e.g. in the test harness
// itself) are still reported normally.
static HOOK_INSTALLED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
HOOK_INSTALLED.get_or_init(|| {
let prev = std::panic::take_hook();
std::panic::set_hook(Box::new(move |info| {
// If we are currently executing inside an actor trampoline the
// panic will be caught by `catch_unwind` momentarily. Suppress
// the hook output to avoid interleaved noise and reentrancy.
// Outside actor context, delegate to the previous hook so that
// genuine runtime panics are still reported.
if crate::actor::current_pid().is_some() {
// Inside an actor — catch_unwind handles it; stay silent.
} else {
prev(info);
}
}));
});
// Open the trace store for this run (no-op without smarm-trace).
#[cfg(feature = "smarm-trace")]
crate::trace::open();
@@ -560,10 +588,23 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
});
if let Some(pid) = parked_pid {
if let Some(slot) = s.slot_mut(pid) {
if matches!(slot.state, State::Parked) {
slot.state = State::Runnable;
s.run_queue.push_back(pid);
crate::te!(crate::trace::Event::Enqueue(pid));
match slot.state {
State::Parked => {
slot.state = State::Runnable;
s.run_queue.push_back(pid);
crate::te!(crate::trace::Event::UnparkDirect(pid));
crate::te!(crate::trace::Event::Enqueue(pid));
}
// Actor is between epoll_register
// and park_current. Set the flag so
// the upcoming Park yield re-queues
// instead of suspending. Mirrors
// scheduler::unpark().
State::Runnable => {
slot.pending_unpark = true;
crate::te!(crate::trace::Event::UnparkDeferred(pid));
}
State::Done => {}
}
}
}
@@ -586,8 +627,16 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
p
}
None => {
// Nothing runnable. Check whether we should wait or exit.
let (next_deadline, io_outstanding, wake_fd, queue_empty, live_actors) =
// Queue was empty when we popped. Re-examine under the lock to
// decide whether to exit or wait. All four conditions must hold
// simultaneously before we exit:
// 1. run queue is still empty
// 2. no live actors (nothing parked, nothing mid-finalize)
// 3. no pending timers
// 4. no outstanding IO
// If any condition fails we keep looping, sleeping on the appropriate
// wake source in between — "check the fridge is empty before you leave
// for the airport".
let (next_deadline, io_outstanding, wake_fd, all_clear) =
inner.with_shared(|s| {
let next = s.timers.peek_deadline();
let (out, fd) = match s.io.as_ref() {
@@ -597,21 +646,20 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
),
None => (0, None),
};
// Count actors that are not Done (Runnable or Parked).
let live = s.slots.iter().filter(|slot| {
slot.actor.is_some()
}).count();
(next, out, fd, s.run_queue.is_empty(), live)
let live = s.slots.iter().filter(|slot| slot.actor.is_some()).count();
let queue_empty = s.run_queue.is_empty();
let all_clear = queue_empty && live == 0 && next.is_none() && out == 0;
(next, out, fd, all_clear)
});
match (next_deadline, io_outstanding, wake_fd, queue_empty, live_actors) {
// Queue is now non-empty (another thread added work): retry.
(_, _, _, false, _) => continue,
// Truly idle — no timers, no IO, no live actors.
(None, 0, _, true, 0) => return,
// Live actors but queue empty: they must be parked on IO or
// timers. Wait on the appropriate source.
(Some(deadline), _, fd_opt, true, _) => {
if all_clear {
return;
}
// Something is still in flight. Sleep on the appropriate source
// to avoid hammering the mutex; the loop will retry on wake.
match (next_deadline, wake_fd) {
(Some(deadline), fd_opt) => {
let now = std::time::Instant::now();
if deadline > now {
let timeout = deadline - now;
@@ -623,22 +671,16 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
None => thread::sleep(timeout),
}
}
continue;
}
(None, _, Some(fd), true, _) => {
(None, Some(fd)) if io_outstanding > 0 => {
crate::io::poll_wake(fd, None);
crate::io::drain_wake_pipe(fd);
continue;
}
// Live actors, queue empty, no IO/timers: they're parked
// waiting for each other (potential deadlock in user code),
// or another thread is about to add work. Sleep briefly to
// avoid hammering the shared mutex.
_ => {
thread::sleep(std::time::Duration::from_micros(100));
continue;
}
}
continue;
}
};
@@ -649,7 +691,9 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
s.slot(pid).and_then(|slot| slot.actor.as_ref().map(|a| a.sp))
}) {
Some(sp) => sp,
None => continue, // stale pid
None => {
continue; // stale pid
}
};
// First resume: move the closure into the trampoline's thread-local.