Stabilize gateway reload/restart behavior and remove doctor --json dependency (#504)

This commit is contained in:
Lingxuan Zuo
2026-03-16 09:47:04 +08:00
committed by GitHub
Unverified
parent 89bda3c7af
commit 7f3408559d
19 changed files with 843 additions and 62 deletions

View File

@@ -43,6 +43,12 @@ import { GatewayConnectionMonitor } from './connection-monitor';
import { GatewayLifecycleController, LifecycleSupersededError } from './lifecycle-controller';
import { launchGatewayProcess } from './process-launcher';
import { GatewayRestartController } from './restart-controller';
import { GatewayRestartGovernor } from './restart-governor';
import {
DEFAULT_GATEWAY_RELOAD_POLICY,
loadGatewayReloadPolicy,
type GatewayReloadPolicy,
} from './reload-policy';
import { classifyGatewayStderrMessage, recordGatewayStartupStderrLine } from './startup-stderr';
import { runGatewayStartupSequence } from './startup-orchestrator';
@@ -94,12 +100,15 @@ export class GatewayManager extends EventEmitter {
private readonly connectionMonitor = new GatewayConnectionMonitor();
private readonly lifecycleController = new GatewayLifecycleController();
private readonly restartController = new GatewayRestartController();
private readonly restartGovernor = new GatewayRestartGovernor();
private reloadDebounceTimer: NodeJS.Timeout | null = null;
private reloadPolicy: GatewayReloadPolicy = { ...DEFAULT_GATEWAY_RELOAD_POLICY };
private reloadPolicyLoadedAt = 0;
private reloadPolicyRefreshPromise: Promise<void> | null = null;
private externalShutdownSupported: boolean | null = null;
private lastRestartAt = 0;
private reconnectAttemptsTotal = 0;
private reconnectSuccessTotal = 0;
private static readonly RESTART_COOLDOWN_MS = 2500;
private static readonly RELOAD_POLICY_REFRESH_MS = 15_000;
constructor(config?: Partial<ReconnectConfig>) {
super();
@@ -109,6 +118,9 @@ export class GatewayManager extends EventEmitter {
this.emit('status', status);
},
onTransition: (previousState, nextState) => {
if (nextState === 'running') {
this.restartGovernor.onRunning();
}
this.restartController.flushDeferredRestart(
`status:${previousState}->${nextState}`,
{
@@ -186,6 +198,7 @@ export class GatewayManager extends EventEmitter {
logger.info(`Gateway start requested (port=${this.status.port})`);
this.lastSpawnSummary = null;
this.shouldReconnect = true;
await this.refreshReloadPolicy(true);
// Lazily load device identity (async file I/O + key generation).
// Must happen before connect() which uses the identity for the handshake.
@@ -353,18 +366,27 @@ export class GatewayManager extends EventEmitter {
return;
}
const now = Date.now();
const sinceLastRestart = now - this.lastRestartAt;
if (sinceLastRestart < GatewayManager.RESTART_COOLDOWN_MS) {
logger.info(
`Gateway restart skipped due to cooldown (${sinceLastRestart}ms < ${GatewayManager.RESTART_COOLDOWN_MS}ms)`,
const decision = this.restartGovernor.decide();
if (!decision.allow) {
const observability = this.restartGovernor.getObservability();
logger.warn(
`[gateway-restart-governor] restart suppressed reason=${decision.reason} retryAfterMs=${decision.retryAfterMs} ` +
`suppressed=${observability.suppressed_total} executed=${observability.executed_total} circuitOpenUntil=${observability.circuit_open_until}`,
);
const props = {
reason: decision.reason,
retry_after_ms: decision.retryAfterMs,
gateway_restart_suppressed_total: observability.suppressed_total,
gateway_restart_executed_total: observability.executed_total,
gateway_restart_circuit_open_until: observability.circuit_open_until,
};
trackMetric('gateway.restart.suppressed', props);
captureTelemetryEvent('gateway_restart_suppressed', props);
return;
}
const pidBefore = this.status.pid;
logger.info(`[gateway-refresh] mode=restart requested pidBefore=${pidBefore ?? 'n/a'}`);
this.lastRestartAt = now;
this.restartInFlight = (async () => {
await this.stop();
await this.start();
@@ -372,8 +394,18 @@ export class GatewayManager extends EventEmitter {
try {
await this.restartInFlight;
this.restartGovernor.recordExecuted();
const observability = this.restartGovernor.getObservability();
const props = {
gateway_restart_executed_total: observability.executed_total,
gateway_restart_suppressed_total: observability.suppressed_total,
gateway_restart_circuit_open_until: observability.circuit_open_until,
};
trackMetric('gateway.restart.executed', props);
captureTelemetryEvent('gateway_restart_executed', props);
logger.info(
`[gateway-refresh] mode=restart result=applied pidBefore=${pidBefore ?? 'n/a'} pidAfter=${this.status.pid ?? 'n/a'}`,
`[gateway-refresh] mode=restart result=applied pidBefore=${pidBefore ?? 'n/a'} pidAfter=${this.status.pid ?? 'n/a'} ` +
`suppressed=${observability.suppressed_total} executed=${observability.executed_total} circuitOpenUntil=${observability.circuit_open_until}`,
);
} finally {
this.restartInFlight = null;
@@ -413,6 +445,16 @@ export class GatewayManager extends EventEmitter {
* Falls back to restart on unsupported platforms or signaling failures.
*/
async reload(): Promise<void> {
await this.refreshReloadPolicy();
if (this.reloadPolicy.mode === 'off' || this.reloadPolicy.mode === 'restart') {
logger.info(
`[gateway-refresh] mode=reload result=policy_forced_restart policy=${this.reloadPolicy.mode}`,
);
await this.restart();
return;
}
if (this.restartController.isRestartDeferred({
state: this.status.state,
startLock: this.startLock,
@@ -481,17 +523,51 @@ export class GatewayManager extends EventEmitter {
* Debounced reload — coalesces multiple rapid config-change events into one
* in-process reload when possible.
*/
debouncedReload(delayMs = 1200): void {
debouncedReload(delayMs?: number): void {
void this.refreshReloadPolicy();
const effectiveDelay = delayMs ?? this.reloadPolicy.debounceMs;
if (this.reloadPolicy.mode === 'off' || this.reloadPolicy.mode === 'restart') {
logger.debug(
`Gateway reload policy=${this.reloadPolicy.mode}; routing debouncedReload to debouncedRestart (${effectiveDelay}ms)`,
);
this.debouncedRestart(effectiveDelay);
return;
}
if (this.reloadDebounceTimer) {
clearTimeout(this.reloadDebounceTimer);
}
logger.debug(`Gateway reload debounced (will fire in ${delayMs}ms)`);
logger.debug(`Gateway reload debounced (will fire in ${effectiveDelay}ms)`);
this.reloadDebounceTimer = setTimeout(() => {
this.reloadDebounceTimer = null;
void this.reload().catch((err) => {
logger.warn('Debounced Gateway reload failed:', err);
});
}, delayMs);
}, effectiveDelay);
}
private async refreshReloadPolicy(force = false): Promise<void> {
const now = Date.now();
if (!force && now - this.reloadPolicyLoadedAt < GatewayManager.RELOAD_POLICY_REFRESH_MS) {
return;
}
if (this.reloadPolicyRefreshPromise) {
await this.reloadPolicyRefreshPromise;
return;
}
this.reloadPolicyRefreshPromise = (async () => {
const nextPolicy = await loadGatewayReloadPolicy();
this.reloadPolicy = nextPolicy;
this.reloadPolicyLoadedAt = Date.now();
})();
try {
await this.reloadPolicyRefreshPromise;
} finally {
this.reloadPolicyRefreshPromise = null;
}
}
/**