Stabilize gateway reload/restart behavior and remove doctor --json dependency (#504)
This commit is contained in:
committed by
GitHub
Unverified
parent
89bda3c7af
commit
7f3408559d
@@ -43,6 +43,12 @@ import { GatewayConnectionMonitor } from './connection-monitor';
|
||||
import { GatewayLifecycleController, LifecycleSupersededError } from './lifecycle-controller';
|
||||
import { launchGatewayProcess } from './process-launcher';
|
||||
import { GatewayRestartController } from './restart-controller';
|
||||
import { GatewayRestartGovernor } from './restart-governor';
|
||||
import {
|
||||
DEFAULT_GATEWAY_RELOAD_POLICY,
|
||||
loadGatewayReloadPolicy,
|
||||
type GatewayReloadPolicy,
|
||||
} from './reload-policy';
|
||||
import { classifyGatewayStderrMessage, recordGatewayStartupStderrLine } from './startup-stderr';
|
||||
import { runGatewayStartupSequence } from './startup-orchestrator';
|
||||
|
||||
@@ -94,12 +100,15 @@ export class GatewayManager extends EventEmitter {
|
||||
private readonly connectionMonitor = new GatewayConnectionMonitor();
|
||||
private readonly lifecycleController = new GatewayLifecycleController();
|
||||
private readonly restartController = new GatewayRestartController();
|
||||
private readonly restartGovernor = new GatewayRestartGovernor();
|
||||
private reloadDebounceTimer: NodeJS.Timeout | null = null;
|
||||
private reloadPolicy: GatewayReloadPolicy = { ...DEFAULT_GATEWAY_RELOAD_POLICY };
|
||||
private reloadPolicyLoadedAt = 0;
|
||||
private reloadPolicyRefreshPromise: Promise<void> | null = null;
|
||||
private externalShutdownSupported: boolean | null = null;
|
||||
private lastRestartAt = 0;
|
||||
private reconnectAttemptsTotal = 0;
|
||||
private reconnectSuccessTotal = 0;
|
||||
private static readonly RESTART_COOLDOWN_MS = 2500;
|
||||
private static readonly RELOAD_POLICY_REFRESH_MS = 15_000;
|
||||
|
||||
constructor(config?: Partial<ReconnectConfig>) {
|
||||
super();
|
||||
@@ -109,6 +118,9 @@ export class GatewayManager extends EventEmitter {
|
||||
this.emit('status', status);
|
||||
},
|
||||
onTransition: (previousState, nextState) => {
|
||||
if (nextState === 'running') {
|
||||
this.restartGovernor.onRunning();
|
||||
}
|
||||
this.restartController.flushDeferredRestart(
|
||||
`status:${previousState}->${nextState}`,
|
||||
{
|
||||
@@ -186,6 +198,7 @@ export class GatewayManager extends EventEmitter {
|
||||
logger.info(`Gateway start requested (port=${this.status.port})`);
|
||||
this.lastSpawnSummary = null;
|
||||
this.shouldReconnect = true;
|
||||
await this.refreshReloadPolicy(true);
|
||||
|
||||
// Lazily load device identity (async file I/O + key generation).
|
||||
// Must happen before connect() which uses the identity for the handshake.
|
||||
@@ -353,18 +366,27 @@ export class GatewayManager extends EventEmitter {
|
||||
return;
|
||||
}
|
||||
|
||||
const now = Date.now();
|
||||
const sinceLastRestart = now - this.lastRestartAt;
|
||||
if (sinceLastRestart < GatewayManager.RESTART_COOLDOWN_MS) {
|
||||
logger.info(
|
||||
`Gateway restart skipped due to cooldown (${sinceLastRestart}ms < ${GatewayManager.RESTART_COOLDOWN_MS}ms)`,
|
||||
const decision = this.restartGovernor.decide();
|
||||
if (!decision.allow) {
|
||||
const observability = this.restartGovernor.getObservability();
|
||||
logger.warn(
|
||||
`[gateway-restart-governor] restart suppressed reason=${decision.reason} retryAfterMs=${decision.retryAfterMs} ` +
|
||||
`suppressed=${observability.suppressed_total} executed=${observability.executed_total} circuitOpenUntil=${observability.circuit_open_until}`,
|
||||
);
|
||||
const props = {
|
||||
reason: decision.reason,
|
||||
retry_after_ms: decision.retryAfterMs,
|
||||
gateway_restart_suppressed_total: observability.suppressed_total,
|
||||
gateway_restart_executed_total: observability.executed_total,
|
||||
gateway_restart_circuit_open_until: observability.circuit_open_until,
|
||||
};
|
||||
trackMetric('gateway.restart.suppressed', props);
|
||||
captureTelemetryEvent('gateway_restart_suppressed', props);
|
||||
return;
|
||||
}
|
||||
|
||||
const pidBefore = this.status.pid;
|
||||
logger.info(`[gateway-refresh] mode=restart requested pidBefore=${pidBefore ?? 'n/a'}`);
|
||||
this.lastRestartAt = now;
|
||||
this.restartInFlight = (async () => {
|
||||
await this.stop();
|
||||
await this.start();
|
||||
@@ -372,8 +394,18 @@ export class GatewayManager extends EventEmitter {
|
||||
|
||||
try {
|
||||
await this.restartInFlight;
|
||||
this.restartGovernor.recordExecuted();
|
||||
const observability = this.restartGovernor.getObservability();
|
||||
const props = {
|
||||
gateway_restart_executed_total: observability.executed_total,
|
||||
gateway_restart_suppressed_total: observability.suppressed_total,
|
||||
gateway_restart_circuit_open_until: observability.circuit_open_until,
|
||||
};
|
||||
trackMetric('gateway.restart.executed', props);
|
||||
captureTelemetryEvent('gateway_restart_executed', props);
|
||||
logger.info(
|
||||
`[gateway-refresh] mode=restart result=applied pidBefore=${pidBefore ?? 'n/a'} pidAfter=${this.status.pid ?? 'n/a'}`,
|
||||
`[gateway-refresh] mode=restart result=applied pidBefore=${pidBefore ?? 'n/a'} pidAfter=${this.status.pid ?? 'n/a'} ` +
|
||||
`suppressed=${observability.suppressed_total} executed=${observability.executed_total} circuitOpenUntil=${observability.circuit_open_until}`,
|
||||
);
|
||||
} finally {
|
||||
this.restartInFlight = null;
|
||||
@@ -413,6 +445,16 @@ export class GatewayManager extends EventEmitter {
|
||||
* Falls back to restart on unsupported platforms or signaling failures.
|
||||
*/
|
||||
async reload(): Promise<void> {
|
||||
await this.refreshReloadPolicy();
|
||||
|
||||
if (this.reloadPolicy.mode === 'off' || this.reloadPolicy.mode === 'restart') {
|
||||
logger.info(
|
||||
`[gateway-refresh] mode=reload result=policy_forced_restart policy=${this.reloadPolicy.mode}`,
|
||||
);
|
||||
await this.restart();
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.restartController.isRestartDeferred({
|
||||
state: this.status.state,
|
||||
startLock: this.startLock,
|
||||
@@ -481,17 +523,51 @@ export class GatewayManager extends EventEmitter {
|
||||
* Debounced reload — coalesces multiple rapid config-change events into one
|
||||
* in-process reload when possible.
|
||||
*/
|
||||
debouncedReload(delayMs = 1200): void {
|
||||
debouncedReload(delayMs?: number): void {
|
||||
void this.refreshReloadPolicy();
|
||||
const effectiveDelay = delayMs ?? this.reloadPolicy.debounceMs;
|
||||
if (this.reloadPolicy.mode === 'off' || this.reloadPolicy.mode === 'restart') {
|
||||
logger.debug(
|
||||
`Gateway reload policy=${this.reloadPolicy.mode}; routing debouncedReload to debouncedRestart (${effectiveDelay}ms)`,
|
||||
);
|
||||
this.debouncedRestart(effectiveDelay);
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.reloadDebounceTimer) {
|
||||
clearTimeout(this.reloadDebounceTimer);
|
||||
}
|
||||
logger.debug(`Gateway reload debounced (will fire in ${delayMs}ms)`);
|
||||
logger.debug(`Gateway reload debounced (will fire in ${effectiveDelay}ms)`);
|
||||
this.reloadDebounceTimer = setTimeout(() => {
|
||||
this.reloadDebounceTimer = null;
|
||||
void this.reload().catch((err) => {
|
||||
logger.warn('Debounced Gateway reload failed:', err);
|
||||
});
|
||||
}, delayMs);
|
||||
}, effectiveDelay);
|
||||
}
|
||||
|
||||
private async refreshReloadPolicy(force = false): Promise<void> {
|
||||
const now = Date.now();
|
||||
if (!force && now - this.reloadPolicyLoadedAt < GatewayManager.RELOAD_POLICY_REFRESH_MS) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.reloadPolicyRefreshPromise) {
|
||||
await this.reloadPolicyRefreshPromise;
|
||||
return;
|
||||
}
|
||||
|
||||
this.reloadPolicyRefreshPromise = (async () => {
|
||||
const nextPolicy = await loadGatewayReloadPolicy();
|
||||
this.reloadPolicy = nextPolicy;
|
||||
this.reloadPolicyLoadedAt = Date.now();
|
||||
})();
|
||||
|
||||
try {
|
||||
await this.reloadPolicyRefreshPromise;
|
||||
} finally {
|
||||
this.reloadPolicyRefreshPromise = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user