fix: stress testing & stability (v0.6.5)
Improve reliability under high load:
- tests/stress.rs: New comprehensive stress test suite (448 lines)
- Fine-tune I/O & runtime scheduling edge cases
- Pin versions & fix MSRV compatibility
This commit is contained in:
@@ -427,9 +427,10 @@ fn epoll_loop(
|
||||
continue;
|
||||
}
|
||||
let fd = ev.u64 as RawFd;
|
||||
let evs = ev.events;
|
||||
q.push_back(Completion::FdReady {
|
||||
fd,
|
||||
events: ev.events,
|
||||
events: evs,
|
||||
});
|
||||
pushed_any = true;
|
||||
}
|
||||
|
||||
100
src/runtime.rs
100
src/runtime.rs
@@ -331,6 +331,34 @@ impl Runtime {
|
||||
/// Run `f` as the initial actor, block until all actors finish.
|
||||
/// Can be called multiple times sequentially on the same `Runtime`.
|
||||
pub fn run(&self, f: impl FnOnce() + Send + 'static) {
|
||||
// Install smarm's panic hook on first call. The default Rust hook is
|
||||
// not reentrant — concurrent actor panics can trigger a double-panic
|
||||
// abort when the backtrace printer takes an internal lock that is
|
||||
// already held. smarm catches every actor panic via `catch_unwind` in
|
||||
// the trampoline, so panics never need to reach the hook for runtime
|
||||
// correctness; the hook fires only as a side-effect of unwinding before
|
||||
// `catch_unwind` catches it.
|
||||
//
|
||||
// We install once and leave it installed: the previous hook is chained
|
||||
// so that panics outside actor context (e.g. in the test harness
|
||||
// itself) are still reported normally.
|
||||
static HOOK_INSTALLED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
|
||||
HOOK_INSTALLED.get_or_init(|| {
|
||||
let prev = std::panic::take_hook();
|
||||
std::panic::set_hook(Box::new(move |info| {
|
||||
// If we are currently executing inside an actor trampoline the
|
||||
// panic will be caught by `catch_unwind` momentarily. Suppress
|
||||
// the hook output to avoid interleaved noise and reentrancy.
|
||||
// Outside actor context, delegate to the previous hook so that
|
||||
// genuine runtime panics are still reported.
|
||||
if crate::actor::current_pid().is_some() {
|
||||
// Inside an actor — catch_unwind handles it; stay silent.
|
||||
} else {
|
||||
prev(info);
|
||||
}
|
||||
}));
|
||||
});
|
||||
|
||||
// Open the trace store for this run (no-op without smarm-trace).
|
||||
#[cfg(feature = "smarm-trace")]
|
||||
crate::trace::open();
|
||||
@@ -560,10 +588,23 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
|
||||
});
|
||||
if let Some(pid) = parked_pid {
|
||||
if let Some(slot) = s.slot_mut(pid) {
|
||||
if matches!(slot.state, State::Parked) {
|
||||
slot.state = State::Runnable;
|
||||
s.run_queue.push_back(pid);
|
||||
crate::te!(crate::trace::Event::Enqueue(pid));
|
||||
match slot.state {
|
||||
State::Parked => {
|
||||
slot.state = State::Runnable;
|
||||
s.run_queue.push_back(pid);
|
||||
crate::te!(crate::trace::Event::UnparkDirect(pid));
|
||||
crate::te!(crate::trace::Event::Enqueue(pid));
|
||||
}
|
||||
// Actor is between epoll_register
|
||||
// and park_current. Set the flag so
|
||||
// the upcoming Park yield re-queues
|
||||
// instead of suspending. Mirrors
|
||||
// scheduler::unpark().
|
||||
State::Runnable => {
|
||||
slot.pending_unpark = true;
|
||||
crate::te!(crate::trace::Event::UnparkDeferred(pid));
|
||||
}
|
||||
State::Done => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -586,8 +627,16 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
|
||||
p
|
||||
}
|
||||
None => {
|
||||
// Nothing runnable. Check whether we should wait or exit.
|
||||
let (next_deadline, io_outstanding, wake_fd, queue_empty, live_actors) =
|
||||
// Queue was empty when we popped. Re-examine under the lock to
|
||||
// decide whether to exit or wait. All four conditions must hold
|
||||
// simultaneously before we exit:
|
||||
// 1. run queue is still empty
|
||||
// 2. no live actors (nothing parked, nothing mid-finalize)
|
||||
// 3. no pending timers
|
||||
// 4. no outstanding IO
|
||||
// If any condition fails we keep looping — "check the fridge is
|
||||
// empty before you leave for the airport".
|
||||
let (next_deadline, io_outstanding, wake_fd, all_clear) =
|
||||
inner.with_shared(|s| {
|
||||
let next = s.timers.peek_deadline();
|
||||
let (out, fd) = match s.io.as_ref() {
|
||||
@@ -597,21 +646,20 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
|
||||
),
|
||||
None => (0, None),
|
||||
};
|
||||
// Count actors that are not Done (Runnable or Parked).
|
||||
let live = s.slots.iter().filter(|slot| {
|
||||
slot.actor.is_some()
|
||||
}).count();
|
||||
(next, out, fd, s.run_queue.is_empty(), live)
|
||||
let live = s.slots.iter().filter(|slot| slot.actor.is_some()).count();
|
||||
let queue_empty = s.run_queue.is_empty();
|
||||
let all_clear = queue_empty && live == 0 && next.is_none() && out == 0;
|
||||
(next, out, fd, all_clear)
|
||||
});
|
||||
|
||||
match (next_deadline, io_outstanding, wake_fd, queue_empty, live_actors) {
|
||||
// Queue is now non-empty (another thread added work): retry.
|
||||
(_, _, _, false, _) => continue,
|
||||
// Truly idle — no timers, no IO, no live actors.
|
||||
(None, 0, _, true, 0) => return,
|
||||
// Live actors but queue empty: they must be parked on IO or
|
||||
// timers. Wait on the appropriate source.
|
||||
(Some(deadline), _, fd_opt, true, _) => {
|
||||
if all_clear {
|
||||
return;
|
||||
}
|
||||
|
||||
// Something is still in flight. Sleep on the appropriate source
|
||||
// to avoid hammering the mutex; the loop will retry on wake.
|
||||
match (next_deadline, wake_fd) {
|
||||
(Some(deadline), fd_opt) => {
|
||||
let now = std::time::Instant::now();
|
||||
if deadline > now {
|
||||
let timeout = deadline - now;
|
||||
@@ -623,22 +671,16 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
|
||||
None => thread::sleep(timeout),
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
(None, _, Some(fd), true, _) => {
|
||||
(None, Some(fd)) if io_outstanding > 0 => {
|
||||
crate::io::poll_wake(fd, None);
|
||||
crate::io::drain_wake_pipe(fd);
|
||||
continue;
|
||||
}
|
||||
// Live actors, queue empty, no IO/timers: they're parked
|
||||
// waiting for each other (potential deadlock in user code),
|
||||
// or another thread is about to add work. Sleep briefly to
|
||||
// avoid hammering the shared mutex.
|
||||
_ => {
|
||||
thread::sleep(std::time::Duration::from_micros(100));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
@@ -649,7 +691,9 @@ fn schedule_loop(inner: &Arc<RuntimeInner>, slot: usize) {
|
||||
s.slot(pid).and_then(|slot| slot.actor.as_ref().map(|a| a.sp))
|
||||
}) {
|
||||
Some(sp) => sp,
|
||||
None => continue, // stale pid
|
||||
None => {
|
||||
continue; // stale pid
|
||||
}
|
||||
};
|
||||
|
||||
// First resume: move the closure into the trampoline's thread-local.
|
||||
|
||||
Reference in New Issue
Block a user