fix(gateway): harden heartbeat timeout recovery to avoid reconnect flapping (#588)
Co-authored-by: zuolingxuan <zuolingxuan@bytedance.com>
This commit is contained in:
committed by
GitHub
Unverified
parent
8cca9af773
commit
8029b507ba
@@ -109,6 +109,9 @@ export class GatewayManager extends EventEmitter {
|
||||
private reconnectAttemptsTotal = 0;
|
||||
private reconnectSuccessTotal = 0;
|
||||
private static readonly RELOAD_POLICY_REFRESH_MS = 15_000;
|
||||
private static readonly HEARTBEAT_INTERVAL_MS = 30_000;
|
||||
private static readonly HEARTBEAT_TIMEOUT_MS = 12_000;
|
||||
private static readonly HEARTBEAT_MAX_MISSES = 3;
|
||||
public static readonly RESTART_COOLDOWN_MS = 5_000;
|
||||
private lastRestartAt = 0;
|
||||
|
||||
@@ -694,6 +697,7 @@ export class GatewayManager extends EventEmitter {
|
||||
onExit: (exitedChild, code) => {
|
||||
this.processExitCode = code;
|
||||
this.ownsProcess = false;
|
||||
this.connectionMonitor.clear();
|
||||
if (this.process === exitedChild) {
|
||||
this.process = null;
|
||||
}
|
||||
@@ -729,8 +733,8 @@ export class GatewayManager extends EventEmitter {
|
||||
getToken: async () => await import('../utils/store').then(({ getSetting }) => getSetting('gatewayToken')),
|
||||
onHandshakeComplete: (ws) => {
|
||||
this.ws = ws;
|
||||
this.ws.on('pong', () => {
|
||||
this.connectionMonitor.handlePong();
|
||||
ws.on('pong', () => {
|
||||
this.connectionMonitor.markAlive('pong');
|
||||
});
|
||||
this.setStatus({
|
||||
state: 'running',
|
||||
@@ -743,6 +747,7 @@ export class GatewayManager extends EventEmitter {
|
||||
this.handleMessage(message);
|
||||
},
|
||||
onCloseAfterHandshake: () => {
|
||||
this.connectionMonitor.clear();
|
||||
if (this.status.state === 'running') {
|
||||
this.setStatus({ state: 'stopped' });
|
||||
this.scheduleReconnect();
|
||||
@@ -755,6 +760,8 @@ export class GatewayManager extends EventEmitter {
|
||||
* Handle incoming WebSocket message
|
||||
*/
|
||||
private handleMessage(message: unknown): void {
|
||||
this.connectionMonitor.markAlive('message');
|
||||
|
||||
if (typeof message !== 'object' || message === null) {
|
||||
logger.debug('Received non-object Gateway message');
|
||||
return;
|
||||
@@ -807,24 +814,34 @@ export class GatewayManager extends EventEmitter {
|
||||
* Start ping interval to keep connection alive
|
||||
*/
|
||||
private startPing(): void {
|
||||
this.connectionMonitor.startPing(
|
||||
() => {
|
||||
this.connectionMonitor.startPing({
|
||||
intervalMs: GatewayManager.HEARTBEAT_INTERVAL_MS,
|
||||
timeoutMs: GatewayManager.HEARTBEAT_TIMEOUT_MS,
|
||||
maxConsecutiveMisses: GatewayManager.HEARTBEAT_MAX_MISSES,
|
||||
sendPing: () => {
|
||||
if (this.ws?.readyState === WebSocket.OPEN) {
|
||||
this.ws.ping();
|
||||
}
|
||||
},
|
||||
() => {
|
||||
logger.error('Gateway WebSocket dead connection detected (pong timeout)');
|
||||
if (this.ws) {
|
||||
this.ws.terminate(); // Force close the dead connection immediately
|
||||
this.ws = null;
|
||||
onHeartbeatTimeout: ({ consecutiveMisses, timeoutMs }) => {
|
||||
if (this.status.state !== 'running' || !this.shouldReconnect) {
|
||||
return;
|
||||
}
|
||||
if (this.status.state === 'running') {
|
||||
this.setStatus({ state: 'error', error: 'WebSocket ping timeout' });
|
||||
this.scheduleReconnect();
|
||||
const ws = this.ws;
|
||||
if (!ws || ws.readyState !== WebSocket.OPEN) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
);
|
||||
|
||||
logger.warn(
|
||||
`Gateway heartbeat timed out after ${consecutiveMisses} consecutive misses (timeout=${timeoutMs}ms); terminating stale socket`,
|
||||
);
|
||||
try {
|
||||
ws.terminate();
|
||||
} catch (error) {
|
||||
logger.warn('Failed to terminate stale Gateway socket after heartbeat timeout:', error);
|
||||
}
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user