Optimize gateway comms reload behavior and strengthen regression coverage (#496)

This commit is contained in:
Lingxuan Zuo
2026-03-15 20:36:48 +08:00
committed by GitHub
Unverified
parent 08960d700f
commit 1dbe4a8466
36 changed files with 1511 additions and 197 deletions

View File

@@ -17,18 +17,8 @@ export function dispatchProtocolEvent(
emitter.emit('chat:message', { message: payload });
break;
case 'agent': {
const p = payload as Record<string, unknown>;
const data = (p.data && typeof p.data === 'object') ? p.data as Record<string, unknown> : {};
const chatEvent: Record<string, unknown> = {
...data,
runId: p.runId ?? data.runId,
sessionKey: p.sessionKey ?? data.sessionKey,
state: p.state ?? data.state,
message: p.message ?? data.message,
};
if (chatEvent.state || chatEvent.message) {
emitter.emit('chat:message', { message: chatEvent });
}
// Keep "agent" on the canonical notification path to avoid double
// handling in renderer when both notification and chat-message are wired.
emitter.emit('notification', { method: event, params: payload });
break;
}

View File

@@ -9,6 +9,7 @@ import WebSocket from 'ws';
import { PORTS } from '../utils/config';
import { JsonRpcNotification, isNotification, isResponse } from './protocol';
import { logger } from '../utils/logger';
import { captureTelemetryEvent, trackMetric } from '../utils/telemetry';
import {
loadOrCreateDeviceIdentity,
type DeviceIdentity,
@@ -95,6 +96,10 @@ export class GatewayManager extends EventEmitter {
private readonly restartController = new GatewayRestartController();
private reloadDebounceTimer: NodeJS.Timeout | null = null;
private externalShutdownSupported: boolean | null = null;
private lastRestartAt = 0;
private reconnectAttemptsTotal = 0;
private reconnectSuccessTotal = 0;
private static readonly RESTART_COOLDOWN_MS = 2500;
constructor(config?: Partial<ReconnectConfig>) {
super();
@@ -348,7 +353,18 @@ export class GatewayManager extends EventEmitter {
return;
}
logger.debug('Gateway restart requested');
const now = Date.now();
const sinceLastRestart = now - this.lastRestartAt;
if (sinceLastRestart < GatewayManager.RESTART_COOLDOWN_MS) {
logger.info(
`Gateway restart skipped due to cooldown (${sinceLastRestart}ms < ${GatewayManager.RESTART_COOLDOWN_MS}ms)`,
);
return;
}
const pidBefore = this.status.pid;
logger.info(`[gateway-refresh] mode=restart requested pidBefore=${pidBefore ?? 'n/a'}`);
this.lastRestartAt = now;
this.restartInFlight = (async () => {
await this.stop();
await this.start();
@@ -356,6 +372,9 @@ export class GatewayManager extends EventEmitter {
try {
await this.restartInFlight;
logger.info(
`[gateway-refresh] mode=restart result=applied pidBefore=${pidBefore ?? 'n/a'} pidAfter=${this.status.pid ?? 'n/a'}`,
);
} finally {
this.restartInFlight = null;
this.restartController.flushDeferredRestart(
@@ -405,13 +424,18 @@ export class GatewayManager extends EventEmitter {
return;
}
const pidBefore = this.process?.pid;
logger.info(`[gateway-refresh] mode=reload requested pid=${pidBefore ?? 'n/a'} state=${this.status.state}`);
if (!this.process?.pid || this.status.state !== 'running') {
logger.warn('[gateway-refresh] mode=reload result=fallback_restart cause=not_running');
logger.warn('Gateway reload requested while not running; falling back to restart');
await this.restart();
return;
}
if (process.platform === 'win32') {
logger.warn('[gateway-refresh] mode=reload result=fallback_restart cause=windows');
logger.debug('Windows detected, falling back to Gateway restart for reload');
await this.restart();
return;
@@ -423,6 +447,9 @@ export class GatewayManager extends EventEmitter {
// Avoid signaling a process that just came up; it will already read latest config.
if (connectedForMs < 8000) {
logger.info(
`[gateway-refresh] mode=reload result=skipped_recent_connect connectedForMs=${connectedForMs} pid=${this.process.pid}`,
);
logger.info(`Gateway connected ${connectedForMs}ms ago, skipping reload signal`);
return;
}
@@ -434,10 +461,17 @@ export class GatewayManager extends EventEmitter {
// If process state doesn't recover quickly, fall back to restart.
await new Promise((resolve) => setTimeout(resolve, 1500));
if (this.status.state !== 'running' || !this.process?.pid) {
logger.warn('[gateway-refresh] mode=reload result=fallback_restart cause=post_signal_unhealthy');
logger.warn('Gateway did not stay running after reload signal, falling back to restart');
await this.restart();
} else {
const pidAfter = this.process.pid;
logger.info(
`[gateway-refresh] mode=reload result=applied_in_place pidBefore=${pidBefore} pidAfter=${pidAfter}`,
);
}
} catch (error) {
logger.warn('[gateway-refresh] mode=reload result=fallback_restart cause=signal_error');
logger.warn('Gateway reload signal failed, falling back to restart:', error);
await this.restart();
}
@@ -731,9 +765,11 @@ export class GatewayManager extends EventEmitter {
return;
}
const cooldownRemaining = Math.max(0, GatewayManager.RESTART_COOLDOWN_MS - (Date.now() - this.lastRestartAt));
const { delay, nextAttempt, maxAttempts } = decision;
const effectiveDelay = Math.max(delay, cooldownRemaining);
this.reconnectAttempts = nextAttempt;
logger.warn(`Scheduling Gateway reconnect attempt ${nextAttempt}/${maxAttempts} in ${delay}ms`);
logger.warn(`Scheduling Gateway reconnect attempt ${nextAttempt}/${maxAttempts} in ${effectiveDelay}ms`);
this.setStatus({
state: 'reconnecting',
@@ -752,16 +788,58 @@ export class GatewayManager extends EventEmitter {
logger.debug(`Skipping reconnect attempt: ${skipReason}`);
return;
}
const attemptNo = this.reconnectAttempts;
this.reconnectAttemptsTotal += 1;
try {
// Use the guarded start() flow so reconnect attempts cannot bypass
// lifecycle locking and accidentally start duplicate Gateway processes.
await this.start();
this.reconnectSuccessTotal += 1;
this.emitReconnectMetric('success', {
attemptNo,
maxAttempts,
delayMs: effectiveDelay,
});
this.reconnectAttempts = 0;
} catch (error) {
logger.error('Gateway reconnection attempt failed:', error);
this.emitReconnectMetric('failure', {
attemptNo,
maxAttempts,
delayMs: effectiveDelay,
error: error instanceof Error ? error.message : String(error),
});
this.scheduleReconnect();
}
}, delay);
}, effectiveDelay);
}
private emitReconnectMetric(
outcome: 'success' | 'failure',
payload: {
attemptNo: number;
maxAttempts: number;
delayMs: number;
error?: string;
},
): void {
// Report one reconnect attempt's outcome to both telemetry sinks,
// alongside the running attempt/success totals and derived success rate.
const attempts = this.reconnectAttemptsTotal;
const successes = this.reconnectSuccessTotal;
let successRate = 0;
if (attempts > 0) {
successRate = successes / attempts;
}
const { attemptNo, maxAttempts, delayMs, error } = payload;
const properties: Record<string, unknown> = {
outcome,
attemptNo,
maxAttempts,
delayMs,
gateway_reconnect_success_count: successes,
gateway_reconnect_attempt_count: attempts,
// Round to 4 decimal places so the metric value stays stable/compact.
gateway_reconnect_success_rate: Number(successRate.toFixed(4)),
};
if (error) {
properties.error = error;
}
trackMetric('gateway.reconnect', properties);
captureTelemetryEvent('gateway_reconnect', properties);
}
/**
@@ -770,4 +848,4 @@ export class GatewayManager extends EventEmitter {
private setStatus(update: Partial<GatewayStatus>): void {
// Pure delegation: hand the partial status update to the state
// controller rather than mutating this.status directly here.
this.stateController.setStatus(update);
}
}
}

View File

@@ -18,6 +18,12 @@ export function classifyGatewayStderrMessage(message: string): GatewayStderrClas
if (msg.includes('closed before connect') && msg.includes('token mismatch')) {
return { level: 'drop', normalized: msg };
}
if (msg.includes('[ws] closed before connect') && msg.includes('code=1005')) {
return { level: 'debug', normalized: msg };
}
if (msg.includes('security warning: dangerous config flags enabled')) {
return { level: 'debug', normalized: msg };
}
// Downgrade frequent non-fatal noise.
if (msg.includes('ExperimentalWarning')) return { level: 'debug', normalized: msg };