fix(gateway): make heartbeat observability-only to prevent false cascade restarts (#762)

This commit is contained in:
paisley
2026-04-03 14:50:06 +08:00
committed by GitHub
Unverified
parent 1d2cbf8f26
commit 83f67e1ed3
2 changed files with 48 additions and 29 deletions

View File

@@ -112,8 +112,16 @@ export class GatewayManager extends EventEmitter {
private static readonly HEARTBEAT_INTERVAL_MS = 30_000;
private static readonly HEARTBEAT_TIMEOUT_MS = 12_000;
private static readonly HEARTBEAT_MAX_MISSES = 3;
// Windows-specific heartbeat parameters — more lenient to reduce log noise
// from false positives caused by Windows Defender scans, system updates,
// and synchronous event-loop blocking in the gateway.
private static readonly HEARTBEAT_INTERVAL_MS_WIN = 60_000;
private static readonly HEARTBEAT_TIMEOUT_MS_WIN = 25_000;
private static readonly HEARTBEAT_MAX_MISSES_WIN = 5;
public static readonly RESTART_COOLDOWN_MS = 5_000;
private lastRestartAt = 0;
/** Set by scheduleReconnect() before calling start() to signal auto-reconnect. */
private isAutoReconnectStart = false;
constructor(config?: Partial<ReconnectConfig>) {
  super();
@@ -216,8 +224,14 @@ export class GatewayManager extends EventEmitter {
  logger.debug('Cleared pending reconnect timer because start was requested manually');
}
this.reconnectAttempts = 0; // Only reset reconnectAttempts on manual start, not on auto-reconnect.
this.setStatus({ state: 'starting', reconnectAttempts: 0 }); // Auto-reconnect calls start() via scheduleReconnect(); those should
// accumulate attempts so the maxAttempts cap works correctly.
if (!this.isAutoReconnectStart) {
this.reconnectAttempts = 0;
}
this.isAutoReconnectStart = false; // consume the flag
this.setStatus({ state: 'starting', reconnectAttempts: this.reconnectAttempts });
// Check if Python environment is ready (self-healing) asynchronously.
// Fire-and-forget: only needs to run once, not on every retry.
@@ -365,6 +379,7 @@ export class GatewayManager extends EventEmitter {
clearPendingGatewayRequests(this.pendingRequests, new Error('Gateway stopped'));
this.restartController.resetDeferredRestart();
this.isAutoReconnectStart = false;
this.setStatus({ state: 'stopped', error: undefined, pid: undefined, connectedAt: undefined, uptime: undefined });
}
@@ -881,40 +896,41 @@ export class GatewayManager extends EventEmitter {
* Start ping interval to keep connection alive * Start ping interval to keep connection alive
*/ */
/**
 * Start the WebSocket heartbeat (ping/pong) loop via the connection monitor.
 *
 * Windows uses more lenient interval/timeout/miss thresholds because
 * Defender scans, system updates, and synchronous event-loop blocking in
 * the gateway routinely delay pongs without the process or socket being
 * dead (see the `_WIN` constants on this class).
 *
 * Heartbeat timeouts are observability-only: a warning is logged but the
 * socket is never terminated and no reconnect is triggered from here.
 */
private startPing(): void {
  const isWindows = process.platform === 'win32';
  this.connectionMonitor.startPing({
    intervalMs: isWindows
      ? GatewayManager.HEARTBEAT_INTERVAL_MS_WIN
      : GatewayManager.HEARTBEAT_INTERVAL_MS,
    timeoutMs: isWindows
      ? GatewayManager.HEARTBEAT_TIMEOUT_MS_WIN
      : GatewayManager.HEARTBEAT_TIMEOUT_MS,
    maxConsecutiveMisses: isWindows
      ? GatewayManager.HEARTBEAT_MAX_MISSES_WIN
      : GatewayManager.HEARTBEAT_MAX_MISSES,
    sendPing: () => {
      // Only ping while the socket is OPEN; pinging a connecting/closing
      // socket would throw.
      if (this.ws?.readyState === WebSocket.OPEN) {
        this.ws.ping();
      }
    },
    onHeartbeatTimeout: ({ consecutiveMisses, timeoutMs }) => {
      // Heartbeat timeout is observability-only. We intentionally do NOT
      // terminate the socket or trigger reconnection here because:
      //
      // 1. If the gateway process dies → child.on('exit') fires reliably.
      // 2. If the socket disconnects → ws.on('close') fires reliably.
      // 3. If the gateway event loop is blocked (skills scanning, GC,
      //    antivirus) → pong is delayed but the process and connection
      //    are still valid. Terminating the socket would cause a
      //    cascading restart loop for no reason.
      //
      // The only scenario ping/pong could catch (silent half-open TCP on
      // localhost) is practically impossible. So we just log.
      const pid = this.process?.pid ?? 'unknown';
      logger.warn(
        `Gateway heartbeat: ${consecutiveMisses} consecutive pong misses ` +
          `(timeout=${timeoutMs}ms, pid=${pid}, state=${this.status.state}). ` +
          `No action taken — relying on process exit and socket close events.`,
      );
    },
  });
}
@@ -979,6 +995,7 @@ export class GatewayManager extends EventEmitter {
try {
  // Use the guarded start() flow so reconnect attempts cannot bypass
  // lifecycle locking and accidentally start duplicate Gateway processes.
this.isAutoReconnectStart = true;
  await this.start();
  this.reconnectSuccessTotal += 1;
  this.emitReconnectMetric('success', {

View File

@@ -16,7 +16,7 @@ describe('GatewayManager heartbeat recovery', () => {
  vi.setSystemTime(new Date('2026-03-19T00:00:00.000Z'));
});
it('terminates stale socket only after 3 consecutive heartbeat misses', async () => { it('logs warning but does NOT terminate socket after consecutive heartbeat misses', async () => {
const { GatewayManager } = await import('@electron/gateway/manager'); const { GatewayManager } = await import('@electron/gateway/manager');
const manager = new GatewayManager(); const manager = new GatewayManager();
@@ -39,7 +39,9 @@ describe('GatewayManager heartbeat recovery', () => {
vi.advanceTimersByTime(120_000);
expect(ws.ping).toHaveBeenCalledTimes(3);
expect(ws.terminate).toHaveBeenCalledTimes(1); // Heartbeat timeout is now observability-only — socket should NOT be terminated.
// Process liveness is detected via child.on('exit'), socket disconnects via ws.on('close').
expect(ws.terminate).not.toHaveBeenCalled();
  (manager as unknown as { connectionMonitor: { clear: () => void } }).connectionMonitor.clear();
});