fix(gateway): make heartbeat observability-only to prevent false cascade restarts (#762)
This commit is contained in:
committed by
GitHub
Unverified
parent
1d2cbf8f26
commit
83f67e1ed3
@@ -112,8 +112,16 @@ export class GatewayManager extends EventEmitter {
|
|||||||
private static readonly HEARTBEAT_INTERVAL_MS = 30_000;
|
private static readonly HEARTBEAT_INTERVAL_MS = 30_000;
|
||||||
private static readonly HEARTBEAT_TIMEOUT_MS = 12_000;
|
private static readonly HEARTBEAT_TIMEOUT_MS = 12_000;
|
||||||
private static readonly HEARTBEAT_MAX_MISSES = 3;
|
private static readonly HEARTBEAT_MAX_MISSES = 3;
|
||||||
|
// Windows-specific heartbeat parameters — more lenient to reduce log noise
|
||||||
|
// from false positives caused by Windows Defender scans, system updates,
|
||||||
|
// and synchronous event-loop blocking in the gateway.
|
||||||
|
private static readonly HEARTBEAT_INTERVAL_MS_WIN = 60_000;
|
||||||
|
private static readonly HEARTBEAT_TIMEOUT_MS_WIN = 25_000;
|
||||||
|
private static readonly HEARTBEAT_MAX_MISSES_WIN = 5;
|
||||||
public static readonly RESTART_COOLDOWN_MS = 5_000;
|
public static readonly RESTART_COOLDOWN_MS = 5_000;
|
||||||
private lastRestartAt = 0;
|
private lastRestartAt = 0;
|
||||||
|
/** Set by scheduleReconnect() before calling start() to signal auto-reconnect. */
|
||||||
|
private isAutoReconnectStart = false;
|
||||||
|
|
||||||
constructor(config?: Partial<ReconnectConfig>) {
|
constructor(config?: Partial<ReconnectConfig>) {
|
||||||
super();
|
super();
|
||||||
@@ -216,8 +224,14 @@ export class GatewayManager extends EventEmitter {
|
|||||||
logger.debug('Cleared pending reconnect timer because start was requested manually');
|
logger.debug('Cleared pending reconnect timer because start was requested manually');
|
||||||
}
|
}
|
||||||
|
|
||||||
this.reconnectAttempts = 0;
|
// Only reset reconnectAttempts on manual start, not on auto-reconnect.
|
||||||
this.setStatus({ state: 'starting', reconnectAttempts: 0 });
|
// Auto-reconnect calls start() via scheduleReconnect(); those should
|
||||||
|
// accumulate attempts so the maxAttempts cap works correctly.
|
||||||
|
if (!this.isAutoReconnectStart) {
|
||||||
|
this.reconnectAttempts = 0;
|
||||||
|
}
|
||||||
|
this.isAutoReconnectStart = false; // consume the flag
|
||||||
|
this.setStatus({ state: 'starting', reconnectAttempts: this.reconnectAttempts });
|
||||||
|
|
||||||
// Check if Python environment is ready (self-healing) asynchronously.
|
// Check if Python environment is ready (self-healing) asynchronously.
|
||||||
// Fire-and-forget: only needs to run once, not on every retry.
|
// Fire-and-forget: only needs to run once, not on every retry.
|
||||||
@@ -365,6 +379,7 @@ export class GatewayManager extends EventEmitter {
|
|||||||
clearPendingGatewayRequests(this.pendingRequests, new Error('Gateway stopped'));
|
clearPendingGatewayRequests(this.pendingRequests, new Error('Gateway stopped'));
|
||||||
|
|
||||||
this.restartController.resetDeferredRestart();
|
this.restartController.resetDeferredRestart();
|
||||||
|
this.isAutoReconnectStart = false;
|
||||||
this.setStatus({ state: 'stopped', error: undefined, pid: undefined, connectedAt: undefined, uptime: undefined });
|
this.setStatus({ state: 'stopped', error: undefined, pid: undefined, connectedAt: undefined, uptime: undefined });
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -881,40 +896,41 @@ export class GatewayManager extends EventEmitter {
|
|||||||
* Start ping interval to keep connection alive
|
* Start ping interval to keep connection alive
|
||||||
*/
|
*/
|
||||||
private startPing(): void {
|
private startPing(): void {
|
||||||
|
const isWindows = process.platform === 'win32';
|
||||||
this.connectionMonitor.startPing({
|
this.connectionMonitor.startPing({
|
||||||
intervalMs: GatewayManager.HEARTBEAT_INTERVAL_MS,
|
intervalMs: isWindows
|
||||||
timeoutMs: GatewayManager.HEARTBEAT_TIMEOUT_MS,
|
? GatewayManager.HEARTBEAT_INTERVAL_MS_WIN
|
||||||
maxConsecutiveMisses: GatewayManager.HEARTBEAT_MAX_MISSES,
|
: GatewayManager.HEARTBEAT_INTERVAL_MS,
|
||||||
|
timeoutMs: isWindows
|
||||||
|
? GatewayManager.HEARTBEAT_TIMEOUT_MS_WIN
|
||||||
|
: GatewayManager.HEARTBEAT_TIMEOUT_MS,
|
||||||
|
maxConsecutiveMisses: isWindows
|
||||||
|
? GatewayManager.HEARTBEAT_MAX_MISSES_WIN
|
||||||
|
: GatewayManager.HEARTBEAT_MAX_MISSES,
|
||||||
sendPing: () => {
|
sendPing: () => {
|
||||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
if (this.ws?.readyState === WebSocket.OPEN) {
|
||||||
this.ws.ping();
|
this.ws.ping();
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
onHeartbeatTimeout: ({ consecutiveMisses, timeoutMs }) => {
|
onHeartbeatTimeout: ({ consecutiveMisses, timeoutMs }) => {
|
||||||
if (this.status.state !== 'running' || !this.shouldReconnect) {
|
// Heartbeat timeout is observability-only. We intentionally do NOT
|
||||||
return;
|
// terminate the socket or trigger reconnection here because:
|
||||||
}
|
//
|
||||||
const ws = this.ws;
|
// 1. If the gateway process dies → child.on('exit') fires reliably.
|
||||||
if (!ws || ws.readyState !== WebSocket.OPEN) {
|
// 2. If the socket disconnects → ws.on('close') fires reliably.
|
||||||
return;
|
// 3. If the gateway event loop is blocked (skills scanning, GC,
|
||||||
}
|
// antivirus) → pong is delayed but the process and connection
|
||||||
|
// are still valid. Terminating the socket would cause a
|
||||||
|
// cascading restart loop for no reason.
|
||||||
|
//
|
||||||
|
// The only scenario ping/pong could catch (silent half-open TCP on
|
||||||
|
// localhost) is practically impossible. So we just log.
|
||||||
|
const pid = this.process?.pid ?? 'unknown';
|
||||||
logger.warn(
|
logger.warn(
|
||||||
`Gateway heartbeat timed out after ${consecutiveMisses} consecutive misses (timeout=${timeoutMs}ms); terminating stale socket`,
|
`Gateway heartbeat: ${consecutiveMisses} consecutive pong misses ` +
|
||||||
|
`(timeout=${timeoutMs}ms, pid=${pid}, state=${this.status.state}). ` +
|
||||||
|
`No action taken — relying on process exit and socket close events.`,
|
||||||
);
|
);
|
||||||
try {
|
|
||||||
ws.terminate();
|
|
||||||
} catch (error) {
|
|
||||||
logger.warn('Failed to terminate stale Gateway socket after heartbeat timeout:', error);
|
|
||||||
}
|
|
||||||
|
|
||||||
// On Windows, onCloseAfterHandshake intentionally skips scheduleReconnect()
|
|
||||||
// to avoid double-reconnect races with the process exit handler. However,
|
|
||||||
// a heartbeat timeout means the socket is stale while the process may still
|
|
||||||
// be alive (no exit event), so we must explicitly trigger reconnect here.
|
|
||||||
if (process.platform === 'win32') {
|
|
||||||
this.scheduleReconnect();
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
@@ -979,6 +995,7 @@ export class GatewayManager extends EventEmitter {
|
|||||||
try {
|
try {
|
||||||
// Use the guarded start() flow so reconnect attempts cannot bypass
|
// Use the guarded start() flow so reconnect attempts cannot bypass
|
||||||
// lifecycle locking and accidentally start duplicate Gateway processes.
|
// lifecycle locking and accidentally start duplicate Gateway processes.
|
||||||
|
this.isAutoReconnectStart = true;
|
||||||
await this.start();
|
await this.start();
|
||||||
this.reconnectSuccessTotal += 1;
|
this.reconnectSuccessTotal += 1;
|
||||||
this.emitReconnectMetric('success', {
|
this.emitReconnectMetric('success', {
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ describe('GatewayManager heartbeat recovery', () => {
|
|||||||
vi.setSystemTime(new Date('2026-03-19T00:00:00.000Z'));
|
vi.setSystemTime(new Date('2026-03-19T00:00:00.000Z'));
|
||||||
});
|
});
|
||||||
|
|
||||||
it('terminates stale socket only after 3 consecutive heartbeat misses', async () => {
|
it('logs warning but does NOT terminate socket after consecutive heartbeat misses', async () => {
|
||||||
const { GatewayManager } = await import('@electron/gateway/manager');
|
const { GatewayManager } = await import('@electron/gateway/manager');
|
||||||
const manager = new GatewayManager();
|
const manager = new GatewayManager();
|
||||||
|
|
||||||
@@ -39,7 +39,9 @@ describe('GatewayManager heartbeat recovery', () => {
|
|||||||
vi.advanceTimersByTime(120_000);
|
vi.advanceTimersByTime(120_000);
|
||||||
|
|
||||||
expect(ws.ping).toHaveBeenCalledTimes(3);
|
expect(ws.ping).toHaveBeenCalledTimes(3);
|
||||||
expect(ws.terminate).toHaveBeenCalledTimes(1);
|
// Heartbeat timeout is now observability-only — socket should NOT be terminated.
|
||||||
|
// Process liveness is detected via child.on('exit'), socket disconnects via ws.on('close').
|
||||||
|
expect(ws.terminate).not.toHaveBeenCalled();
|
||||||
|
|
||||||
(manager as unknown as { connectionMonitor: { clear: () => void } }).connectionMonitor.clear();
|
(manager as unknown as { connectionMonitor: { clear: () => void } }).connectionMonitor.clear();
|
||||||
});
|
});
|
||||||
|
|||||||
Reference in New Issue
Block a user