From 994c5481bfd0978ed31db093eface90b427e9a43 Mon Sep 17 00:00:00 2001 From: admin Date: Wed, 6 May 2026 16:51:12 +0000 Subject: [PATCH] fix: crash loop after reboot - resilient error handlers + mask user service Root causes: 1. uncaughtException/unhandledRejection called gracefulShutdown() -> process.exit(0) Any minor error killed the entire bot. Changed to LOG ONLY (Hermes/OpenCode pattern). 2. User-level systemd service was running alongside system-level, fighting for port 3001. Masked user service permanently. 3. Fragile new Promise(() => {}) keepalive replaced with setInterval-based keepalive. 4. Syntax error in uncaughtException handler (literal newline in single-quoted string). Tested: 5 rapid consecutive restarts all pass. Uptime stable. Co-Authored-By: zcode --- CHANGELOG.md | 13 ++++++++++++- src/bot/index.js | 27 ++++++++++++++++++++------- 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ffddbf8a..aa26dfbe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -43,9 +43,20 @@ Changes: 6. **Full Hermes guardrail integration in tool execution loop** — beforeCall checks, afterCall failure tracking, guidance appended to results +### 🐛 Fixed +- **Crash loop after reboot** — `uncaughtException` and `unhandledRejection` handlers were calling + `gracefulShutdown()` (which calls `process.exit(0)`), so ANY unhandled error killed the bot. + Changed to LOG ONLY (Hermes/OpenCode pattern) — only SIGINT/SIGTERM trigger clean shutdown. +- **Dual systemd service war** — User-level service (`~/.config/systemd/user/zcode.service`) was + running alongside system-level service, both fighting for port 3001. Masked the user service + permanently (`ln -sf /dev/null zcode.service`). +- **Fragile keepalive** — `await new Promise(() => {})` replaced with `setInterval`-based keepalive + whose active timer handle keeps the event loop alive (a pending Promise alone does not). 
+ ### 📄 Files Changed - `src/bot/session-state.js` — Complete rewrite with Hermes guardrail controller (+200 lines) -- `src/bot/index.js` — Parallel tool execution, system prompt overhaul, bash tool guidance (+160 lines) +- `src/bot/index.js` — Parallel tool execution, system prompt overhaul, resilient error handlers (+160 lines) +- `src/bot/index.js` — Fixed syntax error in uncaughtException handler (literal newline in string) - `CHANGELOG.md` — Updated with full v2.0.2 details - `README.md` — Updated header with v2.0.2 summary diff --git a/src/bot/index.js b/src/bot/index.js index 8515882a..eb75295f 100644 --- a/src/bot/index.js +++ b/src/bot/index.js @@ -1407,10 +1407,7 @@ export async function initBot(config, api, tools, skills, agents) { logger.error('Bot error:', err.message || err); }); - // ── Global unhandled rejection guard ── - process.on('unhandledRejection', (reason, promise) => { - logger.error('Unhandled rejection:', reason?.message || reason); - }); + // ── (unhandled rejection handler is registered below, next to the gracefulShutdown signal handlers; it only logs) ── // ── Graceful shutdown is defined at end of initBot (requires full `svc`) ── @@ -1629,6 +1626,7 @@ export async function initBot(config, api, tools, skills, agents) { try { await bot.stop(); } catch {} // Close HTTP server try { await new Promise(r => httpServer.close(r)); } catch {} + releasePidfile(); logger.info('✓ Shutdown complete'); process.exit(0); }; @@ -1642,13 +1640,28 @@ export async function initBot(config, api, tools, skills, agents) { logger.info(`SIGTERM trace (${process.pid}, PPID=${process.ppid}): ${stack}`); gracefulShutdown('SIGTERM'); }); - process.on('uncaughtException', (e) => { logger.error('💥 Uncaught:', e.message, e.stack); gracefulShutdown('uncaught'); }); - process.on('unhandledRejection', (e) => { logger.error('💥 Unhandled Rejection:', e.message); gracefulShutdown('unhandledRejection'); }); + // ── Resilient error handlers (Hermes/OpenCode pattern) ── + // LOG errors but DON'T kill the process. 
Only SIGINT/SIGTERM trigger gracefulShutdown. + // uncaughtException: log and continue. NOTE: once this listener is installed Node.js no longer exits on uncaught exceptions, so the process may keep running with inconsistent state. + process.on('uncaughtException', (e) => { + logger.error('💥 Uncaught exception (non-fatal, continuing):', e.message, String(e.stack).slice(0, 300)); + }); + process.on('unhandledRejection', (e) => { + logger.error('💥 Unhandled rejection (non-fatal, continuing):', e?.message || e); + }); return { send: (chatId, text) => bot.api.sendMessage(chatId, markdownToHtml(text), { parse_mode: 'HTML' }), ws: (chatId, msg) => wsClients.get(chatId)?.send(JSON.stringify(msg)), - waitForMessages: async () => { await new Promise(() => {}); }, + waitForMessages: async () => { + // Robust keepalive: setInterval prevents Node.js from exiting + // (a pending Promise alone does not keep the event loop alive) + await new Promise((resolve) => { + const keepalive = setInterval(() => {}, 60000); // 1-min tick keeps event loop alive + // The interval is never cleared explicitly; it ends when gracefulShutdown calls process.exit on SIGINT/SIGTERM + // This promise intentionally never resolves + }); + }, getConnections: () => wsClients.size, // Expose new systems for external use pluginManager: svc.pluginManager,