fix(stress): port harnesses to v1.2 single-session API + remove WS-batch hang
Local API stress (lib.mjs / api_stress.mjs):
- setupSession now does login -> /admin/api/reset and returns sid="main". Drops the dead /admin/api/quizzes + /admin/api/sessions calls left over from the multi-quiz codex era.
- bootServer writes the fixture pool (STRESS_POOL by default) to a tmp file and passes QUIZ_POOL_PATH so the v1.2 server has a session at startup.
- happyPath: drop the post-connect lobby_update wait (race with snapshot dispatch) and stop double-driving the lifecycle (next() already opens the next question, an explicit open() afterwards is a no-op).
- cross_session: rewritten as "cookie not honored on a non-existent sid" since v1.2 hosts a single canonical session.

Live accuracy stress (live_accuracy.mjs):
- Per-student lobby-snapshot timeout (12s) with WS error/close rejection, so a stalled handshake no longer hangs Promise.all until the outer shell timeout (which produced the exit=124 cycles).
- Open all student WSs in parallel (mirrors what real students do); the batch-of-8 throttle was masking the question we wanted answered.
- Instructor WS open also bounded by a 15s race so any failure surfaces as actionable error text instead of a silent stall.

Bootstrap (deploy/bootstrap.sh):
- Stage 1 provisions a 2GB swap file (idempotent) with vm.swappiness=10. 1GB-RAM ECS instances OOM-kill uvicorn under WS-burst start-of-class pressure; swap absorbs the spike without affecting steady state.
- Pool seeding prefers examples/demo10_pool.json over the 2-question example so a fresh deploy boots with a usable demo.

Pool fixture (examples/demo10_pool.json):
- 10-question generic-knowledge demo pool, gitignore exception added.
This commit is contained in:
@@ -88,9 +88,11 @@ async function joinStudent(sid, studentId, name) {
|
||||
|
||||
// Build a Student object: opens the WS, attaches the message listener
|
||||
// IMMEDIATELY (before connection establishes), so no incoming frame is
|
||||
// ever lost to a listener-attach race. Returns a Promise that resolves
|
||||
// to the bookkeeping struct once the lobby snapshot has arrived.
|
||||
function makeStudent(sid, cookie, idx) {
|
||||
// ever lost to a listener-attach race. Returns a Promise that settles
|
||||
// with {ok:true} when the lobby snapshot arrives, or {ok:false, err}
|
||||
// on WS error / close-before-lobby / per-student timeout. Stage-3 must
|
||||
// settle inside the timeout regardless of network glitches.
|
||||
function makeStudent(sid, cookie, idx, lobbyTimeoutMs) {
|
||||
const studentId = `S${String(idx).padStart(3, "0")}`;
|
||||
const ws = new WebSocket(`${wsBase}/ws/student/${SID}`, {
|
||||
headers: { Cookie: cookie },
|
||||
@@ -105,11 +107,25 @@ function makeStudent(sid, cookie, idx) {
|
||||
closedSeen: new Map(),
|
||||
ended: null,
|
||||
closed: false,
|
||||
lobbyErr: null,
|
||||
};
|
||||
let resolveLobby;
|
||||
const lobbyP = new Promise((r) => { resolveLobby = r; });
|
||||
ws.on("error", () => {});
|
||||
ws.on("close", () => { state.closed = true; });
|
||||
let settleLobby;
|
||||
let settled = false;
|
||||
const lobbyP = new Promise((r) => { settleLobby = r; });
|
||||
const settle = (val) => { if (!settled) { settled = true; settleLobby(val); } };
|
||||
const timer = setTimeout(() => {
|
||||
state.lobbyErr = `timeout after ${lobbyTimeoutMs}ms`;
|
||||
settle({ ok: false, err: state.lobbyErr });
|
||||
}, lobbyTimeoutMs);
|
||||
ws.on("error", (e) => {
|
||||
state.lobbyErr = `ws error: ${e?.message || e}`;
|
||||
settle({ ok: false, err: state.lobbyErr });
|
||||
});
|
||||
ws.on("close", () => {
|
||||
state.closed = true;
|
||||
state.lobbyErr ||= "ws closed before lobby";
|
||||
settle({ ok: false, err: state.lobbyErr });
|
||||
});
|
||||
ws.on("message", (raw) => {
|
||||
let m;
|
||||
try { m = JSON.parse(raw.toString()); } catch { return; }
|
||||
@@ -117,7 +133,8 @@ function makeStudent(sid, cookie, idx) {
|
||||
case "state":
|
||||
if (m.state === "lobby") {
|
||||
state.inLobby = true;
|
||||
resolveLobby();
|
||||
clearTimeout(timer);
|
||||
settle({ ok: true });
|
||||
}
|
||||
break;
|
||||
case "question_open":
|
||||
@@ -149,10 +166,13 @@ function openInstructorWS(adminCookie) {
|
||||
perMessageDeflate: false,
|
||||
});
|
||||
const ev = { ws, lastQuestionOpen: null };
|
||||
let resolveOpen;
|
||||
const openP = new Promise((r) => { resolveOpen = r; });
|
||||
ws.on("open", () => resolveOpen());
|
||||
ws.on("error", () => {});
|
||||
let settle;
|
||||
let settled = false;
|
||||
const openP = new Promise((r) => { settle = r; });
|
||||
const finish = (val) => { if (!settled) { settled = true; settle(val); } };
|
||||
ws.on("open", () => finish({ ok: true }));
|
||||
ws.on("error", (e) => finish({ ok: false, err: `instructor ws error: ${e?.message || e}` }));
|
||||
ws.on("close", () => finish({ ok: false, err: "instructor ws closed before open" }));
|
||||
ws.on("message", (raw) => {
|
||||
let m; try { m = JSON.parse(raw.toString()); } catch { return; }
|
||||
if (m.type === "question_open") ev.lastQuestionOpen = m;
|
||||
@@ -179,21 +199,38 @@ async function main() {
|
||||
if ((i + 1) % 10 === 0) process.stdout.write(` joined ${i + 1}/${N}\n`);
|
||||
}
|
||||
|
||||
console.log(`[stage 3] opening 1 admin + ${N} student WSs (batched)`);
|
||||
console.log(`[stage 3] opening 1 admin + ${N} student WSs (parallel)`);
|
||||
const inst = openInstructorWS(adminCookie);
|
||||
await inst.openP;
|
||||
const instRes = await Promise.race([
|
||||
inst.openP,
|
||||
sleep(15000).then(() => ({ ok: false, err: "instructor WS did not open within 15s" })),
|
||||
]);
|
||||
if (!instRes.ok) throw new Error(instRes.err);
|
||||
|
||||
// Open student WSs in batches of 8, 250ms apart.
|
||||
const students = [];
|
||||
const BATCH = 8, GAP_MS = 250;
|
||||
for (let i = 0; i < cookies.length; i += BATCH) {
|
||||
const slice = cookies.slice(i, i + BATCH);
|
||||
const wave = slice.map((c, j) => makeStudent(SID, c, i + j));
|
||||
await Promise.all(wave.map((s) => s.lobbyP));
|
||||
students.push(...wave.map((s) => s.state));
|
||||
if (i + BATCH < cookies.length) await sleep(GAP_MS);
|
||||
// Open all student WSs in parallel — mirrors what real students do
|
||||
// (no source-side throttle). Per-student lobby timeout = 12s; if any
|
||||
// students fail to lobby in time we PROCEED with the survivors and
|
||||
// log the failure so the cycle records actionable data instead of
|
||||
// hanging until the outer shell timeout.
|
||||
const LOBBY_TIMEOUT_MS = 12000;
|
||||
const wave = cookies.map((c, i) => makeStudent(SID, c, i, LOBBY_TIMEOUT_MS));
|
||||
const results = await Promise.all(wave.map((s) => s.lobbyP));
|
||||
const survivors = wave.filter((_, i) => results[i].ok).map((s) => s.state);
|
||||
const failed = results
|
||||
.map((r, i) => (!r.ok ? { idx: i, err: r.err } : null))
|
||||
.filter(Boolean);
|
||||
if (failed.length) {
|
||||
console.log(`[stage 3] partial — ${survivors.length}/${N} students lobbied within ${LOBBY_TIMEOUT_MS}ms`);
|
||||
failed.slice(0, 5).forEach((f) => console.log(` fail S${String(f.idx).padStart(3, "0")}: ${f.err}`));
|
||||
// Discard dead WSs cleanly so node doesn't keep them alive
|
||||
for (let i = 0; i < wave.length; i++) {
|
||||
if (!results[i].ok) { try { wave[i].state.ws.terminate(); } catch {} }
|
||||
}
|
||||
} else {
|
||||
console.log(`[stage 3] ok — all ${survivors.length} students saw the lobby snapshot`);
|
||||
}
|
||||
console.log(`[stage 3] ok — all ${students.length} students saw the lobby snapshot`);
|
||||
if (survivors.length === 0) throw new Error("no students lobbied; aborting cycle");
|
||||
const students = survivors;
|
||||
|
||||
// -- Drive each question ---
|
||||
console.log(`[stage 4] driving ${totalQs} questions via admin "next"`);
|
||||
|
||||
Reference in New Issue
Block a user