feat: PortManager — intelligent port lifecycle with retry+backoff

Replace 158 lines of fragile inline port logic (probePort, bindPort,
killStaleProcess, waitForPort, readStalePid) with a proper module:

- State machine: idle → probing → claiming → owned → releasing
- Triple holder detection: pidfile → ss → lsof fallback
- Age-based kill strategy (young siblings get waited on, not killed)
- Exponential backoff retry (5 attempts) instead of instant process.exit
- EventEmitter for stateChange/claimed/retry/failed events
- getStatus() for diagnostics
- Exposed in bot return object for external health checks

All previous features preserved, zero downgrades.
This commit is contained in:
Kilo
2026-05-06 18:10:10 +00:00
Unverified
parent 2b07fff073
commit 1447c48e93
2 changed files with 349 additions and 158 deletions

View File

@@ -7,7 +7,6 @@ import { WebSocketServer } from 'ws';
import fs from 'fs';
import path from 'path';
import { execSync } from 'child_process';
import net from 'net';
import { logger } from '../utils/logger.js';
import { checkEnv } from '../utils/env.js';
import { getRTK } from '../utils/rtk.js';
@@ -31,20 +30,10 @@ import { Agent } from '../agents/Agent.js';
import { Task } from '../agents/Task.js';
import { SwarmCoordinator } from '../agents/SwarmCoordinator.js';
import { JSONBackend, InMemoryBackend, MEMORY_TYPES } from './memory-backend.js';
import { PortManager } from './port-manager.js';
// ── Pidfile: records this process's PID; bindPort() reads it back to find (and SIGTERM) a stale instance ──
const PIDFILE = path.join(process.env.HOME || '/tmp', '.zcode-bot.pid');
/**
 * Record this process's PID in PIDFILE.
 *
 * Best-effort: a failure is logged as a warning and never thrown to the caller.
 */
function acquirePidfile() {
  const ownPid = process.pid;
  try {
    fs.writeFileSync(PIDFILE, ownPid.toString());
    logger.info(`✓ Pidfile: ${PIDFILE} (PID ${ownPid})`);
  } catch (writeErr) {
    logger.warn(`Pidfile write failed: ${writeErr.message}`);
  }
}
/**
 * Delete PIDFILE. Any error raised by the removal is deliberately ignored
 * (best-effort cleanup).
 */
function releasePidfile() {
  try {
    fs.unlinkSync(PIDFILE);
  } catch {
    // Intentionally ignored: cleanup must never fail the caller.
  }
}
// ── PortManager handles pidfile + port lifecycle (see port-manager.js) ──
function buildSessionKey(chatId, threadId) {
return threadId ? `${chatId}:${threadId}` : String(chatId);
@@ -1430,9 +1419,15 @@ export async function initBot(config, api, tools, skills, agents) {
// ── (unhandled rejection handler registered below with gracefulShutdown) ──
// ── Graceful shutdown is defined at end of initBot (requires full `svc`) ──
acquirePidfile();
// ── PortManager: smart port lifecycle (claim, retry, recover) ──
// Pidfile lives in $HOME, falling back to /tmp when HOME is unset.
const PIDFILE = path.join(process.env.HOME || '/tmp', '.zcode-bot.pid');
const portManager = new PortManager({
  // Resolved inline instead of reading `PORT`: the `const PORT = …` declaration
  // appears further down in initBot, so referencing it at this point would throw
  // "Cannot access 'PORT' before initialization" (temporal dead zone).
  // The expression is kept identical to that declaration so both agree.
  port: process.env.ZCODE_PORT || 3000,
  pidfile: PIDFILE,
  maxAttempts: 5,
  baseDelayMs: 500,
  maxDelayMs: 5000,
});
// ── Express + WebSocket server (keep for webhook compatibility) ──
const app = express();
@@ -1487,148 +1482,16 @@ export async function initBot(config, api, tools, skills, agents) {
const PORT = process.env.ZCODE_PORT || 3000;
// ── Robust port binding: pidfile-based stale detection, no fuser ──
// Strategy (inspired by Next.js, Vite, webpack-dev-server):
// 1. Read pidfile for stale process, SIGTERM it if still alive
// 2. Probe port with a disposable net socket to confirm availability
// 3. Bind httpServer after port is confirmed free
// This avoids the fuser race condition where fuser returns the current
// process PID because the socket is already half-open.
function readStalePid() {
try {
const pid = parseInt(fs.readFileSync(PIDFILE, 'utf8').trim(), 10);
if (!isNaN(pid) && pid !== process.pid) return pid;
} catch {}
return null;
// ── Claim port via PortManager (retry + stale recovery + backoff) ──
// Waits for portManager.claim(httpServer) to settle before continuing.
// NOTE(review): the retry/backoff behaviour itself lives in port-manager.js,
// which is not visible here — confirm against that module.
try {
await portManager.claim(httpServer);
// Startup banner: only reached when claim() resolved.
logger.info(`✓ HTTP on :${PORT} · WS ready · grammy bot online`);
logger.info(`${svc.tools.length} tools · ${svc.skills.length} skills · ${svc.agents.length} agents`);
} catch (err) {
// claim() rejected: log the reason and terminate with exit code 1.
logger.error(`❌ Port ${PORT} unavailable after retries: ${err.message}`);
process.exit(1);
}
/**
 * Report whether a process with the given PID currently exists.
 *
 * Uses signal 0, which performs the existence/permission check without
 * delivering a signal.
 *
 * @param {number} pid - Process ID to test.
 * @returns {boolean} `true` if the process exists (including when it belongs
 *   to another user and cannot be signalled), `false` otherwise.
 */
function isProcessAlive(pid) {
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM means the process exists but we lack permission to signal it,
    // so it is still alive. Anything else (ESRCH, bad argument) means "no".
    return err?.code === 'EPERM';
  }
}
/**
 * Send SIGTERM to a stale process, unless it is already gone or too young.
 *
 * A process younger than 15 seconds is left alone. This prevents a crash loop
 * where a supervisor restarts the service before the old instance has finished
 * dying and the two instances kill each other.
 *
 * @param {number} pid - PID of the suspected stale process.
 * @returns {boolean} `true` if SIGTERM was sent; `false` if the process is not
 *   alive, is protected by the age guard, or the signal could not be sent.
 */
function killStaleProcess(pid) {
  if (!isProcessAlive(pid)) return false;

  try {
    const stat = fs.readFileSync(`/proc/${pid}/stat`, 'utf8');
    // Field 2 (comm) is wrapped in parentheses and may itself contain ')' or
    // spaces, so the remaining fields start after the LAST ')'.
    const commEnd = stat.lastIndexOf(')');
    if (commEnd !== -1) {
      const statFields = stat.slice(commEnd + 1).trim().split(/\s+/);
      // statFields[0] is field 3 (state), so field 22 (starttime, in clock
      // ticks since boot) is at index 19.
      const startTimeTicks = Number.parseInt(statFields[19], 10);
      if (!Number.isNaN(startTimeTicks)) {
        // "btime <seconds>" in /proc/stat is the boot time as a Unix timestamp.
        const bootTime = fs.readFileSync('/proc/stat', 'utf8')
          .split('\n')
          .find((line) => line.startsWith('btime '))
          ?.split(/\s+/)[1];
        if (bootTime) {
          // Assumes 100 clock ticks per second — TODO confirm on target hosts.
          const startTimeMs = (Number.parseInt(bootTime, 10) + startTimeTicks / 100) * 1000;
          const age = Date.now() - startTimeMs;
          if (age < 15000) {
            logger.warn(`  Skipping PID ${pid} — only ${Math.round(age / 1000)}s old (crash-loop guard)`);
            return false;
          }
        }
      }
    }
  } catch {
    // /proc not available (non-Linux) — skip guard
  }

  try {
    process.kill(pid, 'SIGTERM');
    logger.warn(`  Sent SIGTERM to stale PID ${pid}`);
    return true;
  } catch (e) {
    logger.warn(`  Failed to kill stale PID ${pid}: ${e.message}`);
    return false;
  }
}
/**
 * Check whether a TCP port is already taken by trying to listen on it with a
 * throwaway server.
 *
 * @param {number|string} port - Port to probe.
 * @returns {Promise<boolean>} `true` when listening failed with an error
 *   (treated as "in use"), `false` when the probe bound and closed cleanly.
 */
function probePort(port) {
  return new Promise((resolve) => {
    const probe = net.createServer();

    const reportInUse = () => resolve(true);
    const reportFree = () => {
      // Release the port again before telling the caller it is free.
      probe.close(() => resolve(false));
    };

    probe.on('error', reportInUse);
    probe.listen(port, reportFree);
  });
}
/**
 * Poll a port until it becomes free or the time budget runs out.
 *
 * The port is probed, then re-probed after a 300 ms pause, for as long as the
 * elapsed time stays below `maxMs`.
 *
 * @param {number|string} port - Port to watch.
 * @param {number} [maxMs=5000] - Maximum time to keep polling, in milliseconds.
 * @returns {Promise<boolean>} `true` once a probe finds the port free,
 *   `false` if the budget expired first.
 */
async function waitForPort(port, maxMs = 5000) {
  const startedAt = Date.now();
  const withinBudget = () => Date.now() - startedAt < maxMs;
  const pause = () => new Promise((wake) => setTimeout(wake, 300));

  while (withinBudget()) {
    const stillBusy = await probePort(port);
    if (!stillBusy) {
      return true;
    }
    await pause();
  }
  return false;
}
/**
 * Free PORT if a stale instance holds it, then bind httpServer to PORT.
 *
 * Flow: write the pidfile, probe the port, and if it is busy look for the
 * holder (pidfile first, then `ss`), SIGTERM it via killStaleProcess and wait
 * for the port to free up. Unrecoverable situations end the process with
 * exit code 1. Reads PORT, httpServer, svc and logger from the enclosing scope.
 */
async function bindPort() {
// Ensure pidfile is current before port probe
acquirePidfile();
const inUse = await probePort(PORT);
if (inUse) {
logger.warn(`⚠ Port ${PORT} in use — checking for stale process`);
// readStalePid() yields a PID other than our own, or null.
const stalePid = readStalePid();
if (stalePid) {
const killed = killStaleProcess(stalePid);
// If nothing was killed (killStaleProcess returned false), control falls
// through to the listen() call below without waiting for the port.
if (killed) {
const freed = await waitForPort(PORT);
if (!freed) {
logger.error(`❌ Port ${PORT} still occupied after killing stale PID ${stalePid}`);
process.exit(1);
}
logger.info(`✓ Port ${PORT} freed after stale process cleanup`);
}
} else {
// No stale pidfile — try to find the process via ss
try {
// NOTE(review): PORT is interpolated into a shell command line; it is
// declared from process.env.ZCODE_PORT, so a non-numeric value would reach
// the shell unescaped — consider validating it first.
const ssOut = execSync(`ss -tlnp 'sport = :${PORT}' 2>/dev/null`, { encoding: 'utf8' }).trim();
// Takes the first "pid=<digits>" occurrence in the ss output.
const pidMatch = ssOut.match(/pid=(\d+)/);
if (pidMatch) {
const stalePid = parseInt(pidMatch[1]);
// Never target our own process.
if (!isNaN(stalePid) && stalePid !== process.pid) {
const killed = killStaleProcess(stalePid);
if (killed) {
const freed = await waitForPort(PORT);
if (freed) {
logger.info(`✓ Port ${PORT} freed after killing PID ${stalePid} (detected via ss)`);
} else {
logger.error(`❌ Port ${PORT} still occupied after killing PID ${stalePid}`);
process.exit(1);
}
}
}
} else {
logger.error(`❌ Port ${PORT} occupied by unknown process (no pidfile, ss couldn't identify). Free it manually or change ZCODE_PORT.`);
process.exit(1);
}
} catch {
// Reached when anything inside the try block throws (e.g. the ss command fails).
logger.error(`❌ Port ${PORT} occupied by unknown process. Free it manually or change ZCODE_PORT.`);
process.exit(1);
}
}
}
// Bind the server
// Wraps the callback-style listen() in a promise so bindPort() can await it;
// an 'error' event rejects, which is logged and turned into exit code 1.
await new Promise((resolve, reject) => {
httpServer.listen(PORT, () => {
logger.info(`✓ HTTP on :${PORT} · WS ready · grammy bot online`);
logger.info(`${svc.tools.length} tools · ${svc.skills.length} skills · ${svc.agents.length} agents`);
resolve();
});
httpServer.once('error', (err) => {
reject(err);
});
}).catch((err) => {
logger.error(`❌ Failed to bind port ${PORT}: ${err.message}`);
process.exit(1);
});
}
await bindPort();
@@ -1672,7 +1535,7 @@ export async function initBot(config, api, tools, skills, agents) {
try { await bot.stop(); } catch {}
// Close HTTP server
try { await new Promise(r => httpServer.close(r)); } catch {}
releasePidfile();
portManager.release();
logger.info('✓ Shutdown complete');
process.exit(0);
};
@@ -1715,6 +1578,7 @@ export async function initBot(config, api, tools, skills, agents) {
hookManager: svc.hooks,
memBackend: svc.memBackend,
agentOrchestrator: svc.agentOrchestrator,
portManager,
getState: () => ({ tools: svc.tools.length, skills: svc.skills.length, agents: svc.agents.length, plugins: svc.pluginManager?.getPlugins()?.length || 0, wsClients: wsClients.size }),
};
}