fix(gateway): harden heartbeat timeout recovery to avoid reconnect flapping (#588)

Co-authored-by: zuolingxuan <zuolingxuan@bytedance.com>
This commit is contained in:
Lingxuan Zuo
2026-03-19 14:31:08 +08:00
committed by GitHub
Unverified
parent 8cca9af773
commit 8029b507ba
4 changed files with 241 additions and 41 deletions

View File

@@ -1,46 +1,76 @@
import { logger } from '../utils/logger';
/** Outcome of a health probe: `ok` flags success, `error` optionally carries a failure description. */
type HealthResult = { ok: boolean; error?: string };
/**
 * Kind of signal that proved the connection alive.
 * NOTE(review): 'message' presumably means any inbound gateway message — confirm with callers.
 */
type HeartbeatAliveReason = 'pong' | 'message';
/** Options accepted by `GatewayConnectionMonitor.startPing`. */
type PingOptions = {
// Invoked on each heartbeat tick to emit a ping on the underlying connection.
sendPing: () => void;
// Invoked when the number of consecutive missed heartbeats reaches `maxConsecutiveMisses`.
onHeartbeatTimeout: (context: { consecutiveMisses: number; timeoutMs: number }) => void;
// Milliseconds between heartbeat ticks; startPing falls back to 30000 when omitted.
intervalMs?: number;
// Milliseconds a ping may stay unanswered before it counts as a miss; falls back to 10000.
timeoutMs?: number;
// Misses in a row required to report a timeout; falls back to 3 and is clamped to at least 1.
maxConsecutiveMisses?: number;
};
export class GatewayConnectionMonitor {
// Handle of the repeating heartbeat timer; null while no heartbeat is running.
private pingInterval: NodeJS.Timeout | null = null;
// Handle of a pending one-shot pong-deadline timer; null when none is pending.
private pongTimeout: NodeJS.Timeout | null = null;
// Handle of the repeating health-check timer; null while inactive.
private healthCheckInterval: NodeJS.Timeout | null = null;
// Date.now() timestamp recorded when a ping was sent; 0 after a state reset.
private lastPingAt = 0;
// True between sending a ping and either a liveness signal or a counted miss.
private waitingForAlive = false;
// Number of heartbeats in a row that went unanswered; zeroed by markAlive and by resets.
private consecutiveMisses = 0;
// Latch ensuring onHeartbeatTimeout is reported only once until liveness is observed again.
private timeoutTriggered = false;
/**
 * Starts (or restarts) the periodic heartbeat.
 *
 * On every tick the monitor first checks whether the previous ping is still
 * unanswered after `timeoutMs`; if so it records a miss. Once
 * `maxConsecutiveMisses` misses have accumulated, `onHeartbeatTimeout` is
 * invoked once (latched until `markAlive` is called) and no ping is sent on
 * that tick. Otherwise a new ping is sent.
 *
 * @param options - Callbacks and timing. Defaults: `intervalMs` 30000,
 *   `timeoutMs` 10000, `maxConsecutiveMisses` 3 (clamped to at least 1).
 */
startPing(options: PingOptions): void {
  const intervalMs = options.intervalMs ?? 30000;
  const timeoutMs = options.timeoutMs ?? 10000;
  // Clamp so a zero or negative setting still requires at least one miss.
  const maxConsecutiveMisses = Math.max(1, options.maxConsecutiveMisses ?? 3);

  // A restart must not inherit misses or a latched timeout from a previous run.
  this.resetHeartbeatState();
  if (this.pingInterval) {
    clearInterval(this.pingInterval);
  }

  this.pingInterval = setInterval(() => {
    const now = Date.now();

    if (this.waitingForAlive && now - this.lastPingAt >= timeoutMs) {
      this.waitingForAlive = false;
      this.consecutiveMisses += 1;
      logger.warn(
        `Gateway heartbeat missed (${this.consecutiveMisses}/${maxConsecutiveMisses}, timeout=${timeoutMs}ms)`,
      );

      if (this.consecutiveMisses >= maxConsecutiveMisses && !this.timeoutTriggered) {
        this.timeoutTriggered = true;
        options.onHeartbeatTimeout({
          consecutiveMisses: this.consecutiveMisses,
          timeoutMs,
        });
        // The owner decides how to recover; do not ping on this tick.
        return;
      }
    }

    options.sendPing();
    // Keep the timestamp of the oldest unanswered ping. Overwriting it while a
    // reply is still pending would restart the timeout window on every tick, so
    // a miss could never be detected when intervalMs <= timeoutMs (or when
    // timer jitter makes the elapsed time fall just short of timeoutMs).
    if (!this.waitingForAlive) {
      this.lastPingAt = now;
    }
    this.waitingForAlive = true;
  }, intervalMs);
}
/**
 * Records that the connection has proven itself alive and clears all
 * heartbeat-miss bookkeeping, including the timeout latch.
 *
 * @param reason - Which signal demonstrated liveness; used only in the log line.
 */
markAlive(reason: HeartbeatAliveReason): void {
  // Only log true recovery cases to avoid steady-state heartbeat log spam.
  if (this.consecutiveMisses > 0) {
    logger.debug(`Gateway heartbeat recovered via ${reason} (misses=${this.consecutiveMisses})`);
  }
  this.waitingForAlive = false;
  this.consecutiveMisses = 0;
  // Re-arm timeout reporting so a later outage can be reported again.
  this.timeoutTriggered = false;
}
/**
 * Legacy entry point kept for callers that still report pongs directly;
 * forwards to `markAlive` with the 'pong' reason.
 */
handlePong(): void {
  const reason: HeartbeatAliveReason = 'pong';
  this.markAlive(reason);
}
/**
 * Reports how many heartbeats in a row are currently counted as missed.
 *
 * @returns The current consecutive-miss counter.
 */
getConsecutiveMisses(): number {
  const { consecutiveMisses } = this;
  return consecutiveMisses;
}
startHealthCheck(options: {
@@ -78,13 +108,17 @@ export class GatewayConnectionMonitor {
clearInterval(this.pingInterval);
this.pingInterval = null;
}
if (this.pongTimeout) {
clearTimeout(this.pongTimeout);
this.pongTimeout = null;
}
if (this.healthCheckInterval) {
clearInterval(this.healthCheckInterval);
this.healthCheckInterval = null;
}
this.resetHeartbeatState();
}
/**
 * Returns all heartbeat bookkeeping to its initial values: no ping
 * outstanding, no misses counted, and the timeout latch released.
 */
private resetHeartbeatState(): void {
  // Boolean flags.
  this.waitingForAlive = this.timeoutTriggered = false;
  // Numeric counters and timestamps.
  this.lastPingAt = this.consecutiveMisses = 0;
}
}