fix(gateway): make heartbeat observability-only to prevent false cascade restarts (#762)
This commit is contained in:
committed by
GitHub
Unverified
parent
1d2cbf8f26
commit
83f67e1ed3
@@ -112,8 +112,16 @@ export class GatewayManager extends EventEmitter {
|
||||
private static readonly HEARTBEAT_INTERVAL_MS = 30_000;
|
||||
private static readonly HEARTBEAT_TIMEOUT_MS = 12_000;
|
||||
private static readonly HEARTBEAT_MAX_MISSES = 3;
|
||||
// Windows-specific heartbeat parameters — more lenient to reduce log noise
|
||||
// from false positives caused by Windows Defender scans, system updates,
|
||||
// and synchronous event-loop blocking in the gateway.
|
||||
private static readonly HEARTBEAT_INTERVAL_MS_WIN = 60_000;
|
||||
private static readonly HEARTBEAT_TIMEOUT_MS_WIN = 25_000;
|
||||
private static readonly HEARTBEAT_MAX_MISSES_WIN = 5;
|
||||
public static readonly RESTART_COOLDOWN_MS = 5_000;
|
||||
private lastRestartAt = 0;
|
||||
/** Set by scheduleReconnect() before calling start() to signal auto-reconnect. */
|
||||
private isAutoReconnectStart = false;
|
||||
|
||||
constructor(config?: Partial<ReconnectConfig>) {
|
||||
super();
|
||||
@@ -216,8 +224,14 @@ export class GatewayManager extends EventEmitter {
|
||||
logger.debug('Cleared pending reconnect timer because start was requested manually');
|
||||
}
|
||||
|
||||
this.reconnectAttempts = 0;
|
||||
this.setStatus({ state: 'starting', reconnectAttempts: 0 });
|
||||
// Only reset reconnectAttempts on manual start, not on auto-reconnect.
|
||||
// Auto-reconnect calls start() via scheduleReconnect(); those should
|
||||
// accumulate attempts so the maxAttempts cap works correctly.
|
||||
if (!this.isAutoReconnectStart) {
|
||||
this.reconnectAttempts = 0;
|
||||
}
|
||||
this.isAutoReconnectStart = false; // consume the flag
|
||||
this.setStatus({ state: 'starting', reconnectAttempts: this.reconnectAttempts });
|
||||
|
||||
// Check if Python environment is ready (self-healing) asynchronously.
|
||||
// Fire-and-forget: only needs to run once, not on every retry.
|
||||
@@ -365,6 +379,7 @@ export class GatewayManager extends EventEmitter {
|
||||
clearPendingGatewayRequests(this.pendingRequests, new Error('Gateway stopped'));
|
||||
|
||||
this.restartController.resetDeferredRestart();
|
||||
this.isAutoReconnectStart = false;
|
||||
this.setStatus({ state: 'stopped', error: undefined, pid: undefined, connectedAt: undefined, uptime: undefined });
|
||||
}
|
||||
|
||||
@@ -881,40 +896,41 @@ export class GatewayManager extends EventEmitter {
|
||||
* Start ping interval to keep connection alive
|
||||
*/
|
||||
private startPing(): void {
|
||||
const isWindows = process.platform === 'win32';
|
||||
this.connectionMonitor.startPing({
|
||||
intervalMs: GatewayManager.HEARTBEAT_INTERVAL_MS,
|
||||
timeoutMs: GatewayManager.HEARTBEAT_TIMEOUT_MS,
|
||||
maxConsecutiveMisses: GatewayManager.HEARTBEAT_MAX_MISSES,
|
||||
intervalMs: isWindows
|
||||
? GatewayManager.HEARTBEAT_INTERVAL_MS_WIN
|
||||
: GatewayManager.HEARTBEAT_INTERVAL_MS,
|
||||
timeoutMs: isWindows
|
||||
? GatewayManager.HEARTBEAT_TIMEOUT_MS_WIN
|
||||
: GatewayManager.HEARTBEAT_TIMEOUT_MS,
|
||||
maxConsecutiveMisses: isWindows
|
||||
? GatewayManager.HEARTBEAT_MAX_MISSES_WIN
|
||||
: GatewayManager.HEARTBEAT_MAX_MISSES,
|
||||
sendPing: () => {
|
||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
||||
this.ws.ping();
|
||||
}
|
||||
},
|
||||
onHeartbeatTimeout: ({ consecutiveMisses, timeoutMs }) => {
|
||||
if (this.status.state !== 'running' || !this.shouldReconnect) {
|
||||
return;
|
||||
}
|
||||
const ws = this.ws;
|
||||
if (!ws || ws.readyState !== WebSocket.OPEN) {
|
||||
return;
|
||||
}
|
||||
|
||||
// Heartbeat timeout is observability-only. We intentionally do NOT
|
||||
// terminate the socket or trigger reconnection here because:
|
||||
//
|
||||
// 1. If the gateway process dies → child.on('exit') fires reliably.
|
||||
// 2. If the socket disconnects → ws.on('close') fires reliably.
|
||||
// 3. If the gateway event loop is blocked (skills scanning, GC,
|
||||
// antivirus) → pong is delayed but the process and connection
|
||||
// are still valid. Terminating the socket would cause a
|
||||
// cascading restart loop for no reason.
|
||||
//
|
||||
// The only scenario ping/pong could catch (silent half-open TCP on
|
||||
// localhost) is practically impossible. So we just log.
|
||||
const pid = this.process?.pid ?? 'unknown';
|
||||
logger.warn(
|
||||
`Gateway heartbeat timed out after ${consecutiveMisses} consecutive misses (timeout=${timeoutMs}ms); terminating stale socket`,
|
||||
`Gateway heartbeat: ${consecutiveMisses} consecutive pong misses ` +
|
||||
`(timeout=${timeoutMs}ms, pid=${pid}, state=${this.status.state}). ` +
|
||||
`No action taken — relying on process exit and socket close events.`,
|
||||
);
|
||||
try {
|
||||
ws.terminate();
|
||||
} catch (error) {
|
||||
logger.warn('Failed to terminate stale Gateway socket after heartbeat timeout:', error);
|
||||
}
|
||||
|
||||
// On Windows, onCloseAfterHandshake intentionally skips scheduleReconnect()
|
||||
// to avoid double-reconnect races with the process exit handler. However,
|
||||
// a heartbeat timeout means the socket is stale while the process may still
|
||||
// be alive (no exit event), so we must explicitly trigger reconnect here.
|
||||
if (process.platform === 'win32') {
|
||||
this.scheduleReconnect();
|
||||
}
|
||||
},
|
||||
});
|
||||
}
|
||||
@@ -979,6 +995,7 @@ export class GatewayManager extends EventEmitter {
|
||||
try {
|
||||
// Use the guarded start() flow so reconnect attempts cannot bypass
|
||||
// lifecycle locking and accidentally start duplicate Gateway processes.
|
||||
this.isAutoReconnectStart = true;
|
||||
await this.start();
|
||||
this.reconnectSuccessTotal += 1;
|
||||
this.emitReconnectMetric('success', {
|
||||
|
||||
@@ -16,7 +16,7 @@ describe('GatewayManager heartbeat recovery', () => {
|
||||
vi.setSystemTime(new Date('2026-03-19T00:00:00.000Z'));
|
||||
});
|
||||
|
||||
it('terminates stale socket only after 3 consecutive heartbeat misses', async () => {
|
||||
it('logs warning but does NOT terminate socket after consecutive heartbeat misses', async () => {
|
||||
const { GatewayManager } = await import('@electron/gateway/manager');
|
||||
const manager = new GatewayManager();
|
||||
|
||||
@@ -39,7 +39,9 @@ describe('GatewayManager heartbeat recovery', () => {
|
||||
vi.advanceTimersByTime(120_000);
|
||||
|
||||
expect(ws.ping).toHaveBeenCalledTimes(3);
|
||||
expect(ws.terminate).toHaveBeenCalledTimes(1);
|
||||
// Heartbeat timeout is now observability-only — socket should NOT be terminated.
|
||||
// Process liveness is detected via child.on('exit'), socket disconnects via ws.on('close').
|
||||
expect(ws.terminate).not.toHaveBeenCalled();
|
||||
|
||||
(manager as unknown as { connectionMonitor: { clear: () => void } }).connectionMonitor.clear();
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user