feat(gateway): add websocket pong timeout and channel auto-reconnect watchdog (#569)

This commit is contained in:
paisley
2026-03-18 13:25:30 +08:00
committed by GitHub
Unverified
parent 554f894493
commit 1eda50ef44
4 changed files with 102 additions and 7 deletions

View File

@@ -4,18 +4,45 @@ type HealthResult = { ok: boolean; error?: string };
export class GatewayConnectionMonitor {
private pingInterval: NodeJS.Timeout | null = null;
private pongTimeout: NodeJS.Timeout | null = null;
private healthCheckInterval: NodeJS.Timeout | null = null;
startPing(sendPing: () => void, intervalMs = 30000): void {
/**
 * Start (or restart) the keep-alive ping loop.
 *
 * Every `intervalMs` milliseconds `sendPing` is invoked. When `onPongTimeout`
 * is supplied, a deadline of `timeoutMs` milliseconds is armed together with
 * the ping; unless `handlePong()` cancels it first, `onPongTimeout` is called
 * when the deadline expires. Calling this method again replaces the previous
 * loop and discards any deadline it left pending.
 *
 * @param sendPing      Called on every tick to emit a ping.
 * @param onPongTimeout Optional callback invoked when no pong was recorded
 *                      within `timeoutMs` of a ping. Omit to disable
 *                      dead-connection detection.
 * @param intervalMs    Delay between pings (default 30000).
 * @param timeoutMs     How long to wait for a pong (default 15000).
 */
startPing(
  sendPing: () => void,
  onPongTimeout?: () => void,
  intervalMs = 30000,
  timeoutMs = 15000,
): void {
  // Restarting: drop the previous loop and any deadline it left behind.
  if (this.pingInterval) {
    clearInterval(this.pingInterval);
  }
  if (this.pongTimeout) {
    clearTimeout(this.pongTimeout);
    this.pongTimeout = null;
  }

  this.pingInterval = setInterval(() => {
    sendPing();
    if (!onPongTimeout) {
      return;
    }
    // A deadline from an earlier ping is still pending (in practice only
    // when timeoutMs is not shorter than intervalMs). Keep it running:
    // re-arming on every tick would keep pushing the deadline forward and
    // a dead connection would never be reported.
    if (this.pongTimeout) {
      return;
    }
    this.pongTimeout = setTimeout(() => {
      this.pongTimeout = null;
      onPongTimeout();
    }, timeoutMs);
  }, intervalMs);
}
/**
 * Record that a pong arrived: cancels the pending pong deadline, if there is
 * one, so the timeout callback armed by the ping loop does not fire.
 */
handlePong(): void {
  const pendingDeadline = this.pongTimeout;
  if (!pendingDeadline) {
    return;
  }
  clearTimeout(pendingDeadline);
  this.pongTimeout = null;
}
startHealthCheck(options: {
shouldCheck: () => boolean;
checkHealth: () => Promise<HealthResult>;
@@ -51,6 +78,10 @@ export class GatewayConnectionMonitor {
clearInterval(this.pingInterval);
this.pingInterval = null;
}
if (this.pongTimeout) {
clearTimeout(this.pongTimeout);
this.pongTimeout = null;
}
if (this.healthCheckInterval) {
clearInterval(this.healthCheckInterval);
this.healthCheckInterval = null;

View File

@@ -109,6 +109,8 @@ export class GatewayManager extends EventEmitter {
private reconnectAttemptsTotal = 0;
private reconnectSuccessTotal = 0;
private static readonly RELOAD_POLICY_REFRESH_MS = 15_000;
public static readonly RESTART_COOLDOWN_MS = 5_000;
private lastRestartAt = 0;
constructor(config?: Partial<ReconnectConfig>) {
super();
@@ -727,6 +729,9 @@ export class GatewayManager extends EventEmitter {
getToken: async () => await import('../utils/store').then(({ getSetting }) => getSetting('gatewayToken')),
onHandshakeComplete: (ws) => {
this.ws = ws;
this.ws.on('pong', () => {
this.connectionMonitor.handlePong();
});
this.setStatus({
state: 'running',
port,
@@ -802,11 +807,24 @@ export class GatewayManager extends EventEmitter {
* Start ping interval to keep connection alive
*/
private startPing(): void {
// NOTE(review): the next line and the lone `});` further down repeat the call
// that follows; they look like the pre-change lines of this diff hunk —
// confirm before treating this span as compilable code.
this.connectionMonitor.startPing(() => {
this.connectionMonitor.startPing(
// Ping sender: only pings while the socket reports OPEN.
() => {
if (this.ws?.readyState === WebSocket.OPEN) {
this.ws.ping();
}
});
},
// Pong-timeout handler: invoked by the connection monitor when no pong was
// recorded in time.
() => {
logger.error('Gateway WebSocket dead connection detected (pong timeout)');
if (this.ws) {
this.ws.terminate(); // Force close the dead connection immediately
this.ws = null;
}
// Only flip to the error state and schedule a reconnect when the gateway
// was still marked as running.
if (this.status.state === 'running') {
this.setStatus({ state: 'error', error: 'WebSocket ping timeout' });
this.scheduleReconnect();
}
}
);
}
/**

View File

@@ -33,8 +33,13 @@ interface ChannelsState {
setChannels: (channels: Channel[]) => void;
updateChannel: (channelId: string, updates: Partial<Channel>) => void;
clearError: () => void;
scheduleAutoReconnect: (channelId: string) => void;
clearAutoReconnect: (channelId: string) => void;
}
// Pending auto-reconnect timers keyed by channel id; an entry exists only
// while a reconnect attempt is scheduled and has not yet run or been cleared.
const reconnectTimers = new Map<string, NodeJS.Timeout>();
// Number of auto-reconnect attempts executed per channel id; drives the
// backoff delay and is dropped when auto-reconnect is cleared for the channel.
const reconnectAttempts = new Map<string, number>();
export const useChannelsStore = create<ChannelsState>((set, get) => ({
channels: [],
loading: false,
@@ -194,7 +199,8 @@ export const useChannelsStore = create<ChannelsState>((set, get) => ({
},
disconnectChannel: async (channelId) => {
const { updateChannel } = get();
const { updateChannel, clearAutoReconnect } = get();
clearAutoReconnect(channelId);
try {
await useGatewayStore.getState().rpc('channels.disconnect', { channelId });
@@ -223,4 +229,37 @@ export const useChannelsStore = create<ChannelsState>((set, get) => ({
},
clearError: () => set({ error: null }),
/**
 * Queue one delayed reconnect attempt for `channelId`.
 *
 * Does nothing when an attempt is already pending for the channel. The delay
 * doubles with every attempt already executed (5s, 10s, 20s, ...) and is
 * capped at 2 minutes. When the timer fires, the reconnect is only executed
 * if the channel still exists and is `disconnected` or `error`.
 *
 * NOTE(review): the attempt counter is wiped by `clearAutoReconnect`; if that
 * is invoked on every `connecting` transition, the backoff restarts from the
 * first step on each attempt — confirm this is intended.
 */
scheduleAutoReconnect: (channelId) => {
  // At most one pending attempt per channel.
  if (reconnectTimers.has(channelId)) return;

  const attempts = reconnectAttempts.get(channelId) ?? 0;
  // Exponential backoff capped at 2 minutes
  const delay = Math.min(5000 * Math.pow(2, attempts), 120000);
  console.log(`[Watchdog] Scheduling auto-reconnect for ${channelId} in ${delay}ms (attempt ${attempts + 1})`);

  const timer = setTimeout(() => {
    // The timer has fired, so a new attempt may be scheduled from here on.
    reconnectTimers.delete(channelId);

    const state = get();
    const channel = state.channels.find((c) => c.id === channelId);
    // Skip when the channel is gone or no longer in a failed state.
    if (!channel || (channel.status !== 'disconnected' && channel.status !== 'error')) {
      return;
    }

    reconnectAttempts.set(channelId, attempts + 1);
    console.log(`[Watchdog] Executing auto-reconnect for ${channelId} (attempt ${attempts + 1})`);
    // Best-effort: a failed attempt must not throw, but it should be visible.
    state.connectChannel(channelId).catch((error: unknown) => {
      console.warn(`[Watchdog] Auto-reconnect for ${channelId} failed (attempt ${attempts + 1})`, error);
    });
  }, delay);
  reconnectTimers.set(channelId, timer);
},
/**
 * Cancel auto-reconnect for `channelId`: forgets its attempt counter and,
 * when a reconnect timer is pending, cancels and unregisters it.
 */
clearAutoReconnect: (channelId) => {
  // The counter is independent of the timer bookkeeping, so drop it up front.
  reconnectAttempts.delete(channelId);

  const pendingTimer = reconnectTimers.get(channelId);
  if (!pendingTimer) {
    return;
  }
  clearTimeout(pendingTimer);
  reconnectTimers.delete(channelId);
},
}));

View File

@@ -269,7 +269,14 @@ export const useGatewayStore = create<GatewayState>((set, get) => ({
const state = useChannelsStore.getState();
const channel = state.channels.find((item) => item.type === update.channelId);
if (channel) {
state.updateChannel(channel.id, { status: mapChannelStatus(update.status) });
const newStatus = mapChannelStatus(update.status);
state.updateChannel(channel.id, { status: newStatus });
if (newStatus === 'disconnected' || newStatus === 'error') {
state.scheduleAutoReconnect(channel.id);
} else if (newStatus === 'connected' || newStatus === 'connecting') {
state.clearAutoReconnect(channel.id);
}
}
})
.catch(() => {});