diff --git a/electron/gateway/manager.ts b/electron/gateway/manager.ts index 7d23cdd77..58f3cc74a 100644 --- a/electron/gateway/manager.ts +++ b/electron/gateway/manager.ts @@ -112,8 +112,16 @@ export class GatewayManager extends EventEmitter { private static readonly HEARTBEAT_INTERVAL_MS = 30_000; private static readonly HEARTBEAT_TIMEOUT_MS = 12_000; private static readonly HEARTBEAT_MAX_MISSES = 3; + // Windows-specific heartbeat parameters — more lenient to reduce log noise + // from false positives caused by Windows Defender scans, system updates, + // and synchronous event-loop blocking in the gateway. + private static readonly HEARTBEAT_INTERVAL_MS_WIN = 60_000; + private static readonly HEARTBEAT_TIMEOUT_MS_WIN = 25_000; + private static readonly HEARTBEAT_MAX_MISSES_WIN = 5; public static readonly RESTART_COOLDOWN_MS = 5_000; private lastRestartAt = 0; + /** Set by scheduleReconnect() before calling start() to signal auto-reconnect. */ + private isAutoReconnectStart = false; constructor(config?: Partial) { super(); @@ -216,8 +224,14 @@ export class GatewayManager extends EventEmitter { logger.debug('Cleared pending reconnect timer because start was requested manually'); } - this.reconnectAttempts = 0; - this.setStatus({ state: 'starting', reconnectAttempts: 0 }); + // Only reset reconnectAttempts on manual start, not on auto-reconnect. + // Auto-reconnect calls start() via scheduleReconnect(); those should + // accumulate attempts so the maxAttempts cap works correctly. + if (!this.isAutoReconnectStart) { + this.reconnectAttempts = 0; + } + this.isAutoReconnectStart = false; // consume the flag + this.setStatus({ state: 'starting', reconnectAttempts: this.reconnectAttempts }); // Check if Python environment is ready (self-healing) asynchronously. // Fire-and-forget: only needs to run once, not on every retry. @@ -365,6 +379,7 @@ export class GatewayManager extends EventEmitter { clearPendingGatewayRequests(this.pendingRequests, new Error('Gateway stopped')); this.restartController.resetDeferredRestart(); + this.isAutoReconnectStart = false; this.setStatus({ state: 'stopped', error: undefined, pid: undefined, connectedAt: undefined, uptime: undefined }); } @@ -881,40 +896,41 @@ export class GatewayManager extends EventEmitter { * Start ping interval to keep connection alive */ private startPing(): void { + const isWindows = process.platform === 'win32'; this.connectionMonitor.startPing({ - intervalMs: GatewayManager.HEARTBEAT_INTERVAL_MS, - timeoutMs: GatewayManager.HEARTBEAT_TIMEOUT_MS, - maxConsecutiveMisses: GatewayManager.HEARTBEAT_MAX_MISSES, + intervalMs: isWindows + ? GatewayManager.HEARTBEAT_INTERVAL_MS_WIN + : GatewayManager.HEARTBEAT_INTERVAL_MS, + timeoutMs: isWindows + ? GatewayManager.HEARTBEAT_TIMEOUT_MS_WIN + : GatewayManager.HEARTBEAT_TIMEOUT_MS, + maxConsecutiveMisses: isWindows + ? GatewayManager.HEARTBEAT_MAX_MISSES_WIN + : GatewayManager.HEARTBEAT_MAX_MISSES, sendPing: () => { if (this.ws?.readyState === WebSocket.OPEN) { this.ws.ping(); } }, onHeartbeatTimeout: ({ consecutiveMisses, timeoutMs }) => { - if (this.status.state !== 'running' || !this.shouldReconnect) { - return; - } - const ws = this.ws; - if (!ws || ws.readyState !== WebSocket.OPEN) { - return; - } - + // Heartbeat timeout is observability-only. We intentionally do NOT + // terminate the socket or trigger reconnection here because: + // + // 1. If the gateway process dies → child.on('exit') fires reliably. + // 2. If the socket disconnects → ws.on('close') fires reliably. + // 3. If the gateway event loop is blocked (skills scanning, GC, + // antivirus) → pong is delayed but the process and connection + // are still valid. Terminating the socket would cause a + // cascading restart loop for no reason. + // + // The only scenario ping/pong could catch (silent half-open TCP on + // localhost) is practically impossible. So we just log. + const pid = this.process?.pid ?? 'unknown'; logger.warn( - `Gateway heartbeat timed out after ${consecutiveMisses} consecutive misses (timeout=${timeoutMs}ms); terminating stale socket`, + `Gateway heartbeat: ${consecutiveMisses} consecutive pong misses ` + + `(timeout=${timeoutMs}ms, pid=${pid}, state=${this.status.state}). ` + + `No action taken — relying on process exit and socket close events.`, ); - try { - ws.terminate(); - } catch (error) { - logger.warn('Failed to terminate stale Gateway socket after heartbeat timeout:', error); - } - - // On Windows, onCloseAfterHandshake intentionally skips scheduleReconnect() - // to avoid double-reconnect races with the process exit handler. However, - // a heartbeat timeout means the socket is stale while the process may still - // be alive (no exit event), so we must explicitly trigger reconnect here. - if (process.platform === 'win32') { - this.scheduleReconnect(); - } }, }); } @@ -979,6 +995,7 @@ export class GatewayManager extends EventEmitter { try { // Use the guarded start() flow so reconnect attempts cannot bypass // lifecycle locking and accidentally start duplicate Gateway processes. + this.isAutoReconnectStart = true; await this.start(); this.reconnectSuccessTotal += 1; this.emitReconnectMetric('success', { diff --git a/tests/unit/gateway-manager-heartbeat.test.ts b/tests/unit/gateway-manager-heartbeat.test.ts index e7d68949f..d1d834474 100644 --- a/tests/unit/gateway-manager-heartbeat.test.ts +++ b/tests/unit/gateway-manager-heartbeat.test.ts @@ -16,7 +16,7 @@ describe('GatewayManager heartbeat recovery', () => { vi.setSystemTime(new Date('2026-03-19T00:00:00.000Z')); }); - it('terminates stale socket only after 3 consecutive heartbeat misses', async () => { + it('logs warning but does NOT terminate socket after consecutive heartbeat misses', async () => { const { GatewayManager } = await import('@electron/gateway/manager'); const manager = new GatewayManager(); @@ -39,7 +39,9 @@ describe('GatewayManager heartbeat recovery', () => { vi.advanceTimersByTime(120_000); expect(ws.ping).toHaveBeenCalledTimes(3); - expect(ws.terminate).toHaveBeenCalledTimes(1); + // Heartbeat timeout is now observability-only — socket should NOT be terminated. + // Process liveness is detected via child.on('exit'), socket disconnects via ws.on('close'). + expect(ws.terminate).not.toHaveBeenCalled(); (manager as unknown as { connectionMonitor: { clear: () => void } }).connectionMonitor.clear(); });