Add channel health diagnostics and gateway recovery fixes (#855)

This commit is contained in:
Lingxuan Zuo
2026-04-15 13:51:02 +08:00
committed by GitHub
Unverified
parent 6acd8acf5a
commit 1f39d1a8a7
22 changed files with 1868 additions and 52 deletions

View File

@@ -1,4 +1,5 @@
// NOTE(review): `ChannelConnectionStatus` is declared twice in this view, once
// without and once with 'degraded'. This looks like the pre- and post-change
// lines of a diff shown together — confirm that only one declaration (presumably
// the one that includes 'degraded') exists in the real file.
export type ChannelConnectionStatus = 'connected' | 'connecting' | 'disconnected' | 'error';
/** Gateway-level health classification. */
export type GatewayHealthState = 'healthy' | 'degraded' | 'unresponsive';
/** Connection status reported for a channel; includes the 'degraded' state. */
export type ChannelConnectionStatus = 'connected' | 'connecting' | 'degraded' | 'disconnected' | 'error';
export interface ChannelRuntimeAccountSnapshot {
connected?: boolean;
@@ -19,6 +20,10 @@ export interface ChannelRuntimeSummarySnapshot {
lastError?: string | null;
}
/**
 * Optional gateway-health context supplied alongside channel runtime data.
 */
export interface ChannelHealthOverlay {
/** Current gateway health state; may be omitted when no health information is available. */
gatewayHealthState?: GatewayHealthState;
}
// Ten minutes, expressed in milliseconds (per the `_MS` suffix). The code that
// consumes this constant is outside the visible part of the file.
const RECENT_ACTIVITY_MS = 10 * 60 * 1000;
function hasNonEmptyError(value: string | null | undefined): boolean {
@@ -74,9 +79,11 @@ export function isChannelRuntimeConnected(
/**
 * Derives the connection status for a single channel account.
 *
 * As written in this view, the checks run in this order and the first match wins:
 * connected, error, degraded (gateway health present and not 'healthy'),
 * connecting (`account.running === true`), otherwise disconnected.
 *
 * NOTE(review): the `isChannelRuntimeConnected` check appears twice. With the
 * first occurrence in place, the second can never be reached. This looks like
 * the removed and added lines of a diff shown together — confirm which position
 * the real file keeps, because it decides whether 'error' / 'degraded' can
 * override 'connected'.
 *
 * @param account Runtime snapshot of the account being classified.
 * @param healthOverlay Optional gateway health context.
 * @returns The status selected by the first matching check.
 */
export function computeChannelRuntimeStatus(
account: ChannelRuntimeAccountSnapshot,
healthOverlay?: ChannelHealthOverlay,
): ChannelConnectionStatus {
if (isChannelRuntimeConnected(account)) return 'connected';
if (hasChannelRuntimeError(account)) return 'error';
// Any reported gateway health state other than 'healthy' maps to 'degraded'.
if (healthOverlay?.gatewayHealthState && healthOverlay.gatewayHealthState !== 'healthy') return 'degraded';
// Unreachable as displayed: the identical check above has already returned.
if (isChannelRuntimeConnected(account)) return 'connected';
if (account.running === true) return 'connecting';
return 'disconnected';
}
@@ -84,6 +91,7 @@ export function computeChannelRuntimeStatus(
export function pickChannelRuntimeStatus(
accounts: ChannelRuntimeAccountSnapshot[],
summary?: ChannelRuntimeSummarySnapshot,
healthOverlay?: ChannelHealthOverlay,
): ChannelConnectionStatus {
if (accounts.some((account) => isChannelRuntimeConnected(account))) {
return 'connected';
@@ -93,6 +101,10 @@ export function pickChannelRuntimeStatus(
return 'error';
}
if (healthOverlay?.gatewayHealthState && healthOverlay.gatewayHealthState !== 'healthy') {
return 'degraded';
}
if (accounts.some((account) => account.running === true)) {
return 'connecting';
}

View File

@@ -0,0 +1,81 @@
import type {
GatewayDiagnosticsSnapshot,
GatewayHealthSummary,
GatewayStatus,
} from '../gateway/manager';
/** Inputs consumed by `buildGatewayHealthSummary`. */
type BuildGatewayHealthSummaryOptions = {
  /** Gateway status; only its `state` field is read here. */
  status: GatewayStatus;
  /** Heartbeat / RPC counters and timestamps copied into the summary. */
  diagnostics: GatewayDiagnosticsSnapshot;
  /** Timestamp of the last successful channels-status check, if any. */
  lastChannelsStatusOkAt?: number;
  /** Timestamp of the last failed channels-status check, if any. */
  lastChannelsStatusFailureAt?: number;
  /** Platform identifier; `'win32'` selects the higher heartbeat threshold. */
  platform?: string;
  /** Reference time for the recency check; defaults to `Date.now()`. */
  now?: number;
};

// A channels-status failure only counts while it is at most this old (2 minutes).
const CHANNEL_STATUS_FAILURE_WINDOW_MS = 2 * 60_000;
// Consecutive heartbeat misses at or above the threshold mark the gateway unresponsive.
const HEARTBEAT_MISS_THRESHOLD_DEFAULT = 3;
const HEARTBEAT_MISS_THRESHOLD_WIN = 5;

/**
 * Condenses gateway status and diagnostics into a health summary.
 *
 * - When the gateway is not in the `'running'` state the summary is always
 *   `'degraded'` with a single reason (`'gateway_error'` for the `'error'`
 *   state, `'gateway_not_running'` otherwise).
 * - When running, reasons are collected from heartbeat misses, consecutive RPC
 *   failures and a recent channels-status failure that has not been followed
 *   by a success. The state is `'unresponsive'` once heartbeat misses reach
 *   the platform threshold, `'degraded'` if any other reason was collected,
 *   and `'healthy'` otherwise.
 *
 * @param options Status, diagnostics and optional timing/platform inputs.
 * @returns The health state, its reasons, and the pass-through diagnostic fields.
 */
export function buildGatewayHealthSummary(
  options: BuildGatewayHealthSummaryOptions,
): GatewayHealthSummary {
  const { status, diagnostics } = options;
  const okAt = options.lastChannelsStatusOkAt;
  const failedAt = options.lastChannelsStatusFailureAt;

  // Fields that are copied verbatim into the summary on every path.
  const carried = {
    consecutiveHeartbeatMisses: diagnostics.consecutiveHeartbeatMisses,
    lastAliveAt: diagnostics.lastAliveAt,
    lastRpcSuccessAt: diagnostics.lastRpcSuccessAt,
    lastRpcFailureAt: diagnostics.lastRpcFailureAt,
    lastRpcFailureMethod: diagnostics.lastRpcFailureMethod,
    lastChannelsStatusOkAt: okAt,
    lastChannelsStatusFailureAt: failedAt,
  };

  // A gateway that is not running short-circuits every other signal.
  if (status.state !== 'running') {
    const soleReason = status.state === 'error' ? 'gateway_error' : 'gateway_not_running';
    return {
      state: 'degraded',
      reasons: [soleReason],
      ...carried,
    };
  }

  const now = options.now ?? Date.now();

  // The failure must fall inside the window and be newer than the last success.
  let unrecoveredRecentFailure = false;
  if (typeof failedAt === 'number' && now - failedAt <= CHANNEL_STATUS_FAILURE_WINDOW_MS) {
    unrecoveredRecentFailure = typeof okAt !== 'number' || failedAt > okAt;
  }

  const missLimit =
    options.platform === 'win32'
      ? HEARTBEAT_MISS_THRESHOLD_WIN
      : HEARTBEAT_MISS_THRESHOLD_DEFAULT;
  const misses = diagnostics.consecutiveHeartbeatMisses;
  const unresponsive = misses >= missLimit;

  // Each reason is appended at most once, in the order the checks run.
  const reasons: string[] = [];
  if (unresponsive) {
    reasons.push('gateway_unresponsive');
  } else if (misses > 0) {
    reasons.push('gateway_degraded');
  }
  if (diagnostics.consecutiveRpcFailures > 0) {
    reasons.push('rpc_timeout');
  }
  if (unrecoveredRecentFailure) {
    reasons.push('channels_status_timeout');
  }

  let state: 'healthy' | 'degraded' | 'unresponsive';
  if (unresponsive) {
    state = 'unresponsive';
  } else if (reasons.length > 0) {
    state = 'degraded';
  } else {
    state = 'healthy';
  }

  return {
    state,
    reasons,
    ...carried,
  };
}

View File

@@ -1634,6 +1634,51 @@ export async function sanitizeOpenClawConfig(): Promise<void> {
pluginsObj.allow = allowArr;
}
// ── acpx legacy config/install cleanup ─────────────────────
// Older OpenClaw releases allowed plugins.entries.acpx.config.command
// and expectedVersion overrides. Current bundled acpx schema rejects
// them, which causes the Gateway to fail validation before startup.
// Strip those keys and drop stale installs metadata that still points
// at an older bundled OpenClaw tree so the current bundled plugin can
// be re-registered cleanly.
const acpxEntry = isPlainRecord(pEntries.acpx) ? pEntries.acpx as Record<string, unknown> : null;
const acpxConfig = acpxEntry && isPlainRecord(acpxEntry.config)
? acpxEntry.config as Record<string, unknown>
: null;
if (acpxConfig) {
for (const legacyKey of ['command', 'expectedVersion'] as const) {
if (legacyKey in acpxConfig) {
delete acpxConfig[legacyKey];
modified = true;
console.log(`[sanitize] Removed legacy plugins.entries.acpx.config.${legacyKey}`);
}
}
}
const installs = isPlainRecord(pluginsObj.installs) ? pluginsObj.installs as Record<string, unknown> : null;
const acpxInstall = installs && isPlainRecord(installs.acpx) ? installs.acpx as Record<string, unknown> : null;
if (acpxInstall) {
const currentBundledAcpxDir = join(getOpenClawResolvedDir(), 'dist', 'extensions', 'acpx').replace(/\\/g, '/');
const sourcePath = typeof acpxInstall.sourcePath === 'string' ? acpxInstall.sourcePath : '';
const installPath = typeof acpxInstall.installPath === 'string' ? acpxInstall.installPath : '';
const normalizedSourcePath = sourcePath.replace(/\\/g, '/');
const normalizedInstallPath = installPath.replace(/\\/g, '/');
const pointsAtDifferentBundledTree = [normalizedSourcePath, normalizedInstallPath].some(
(candidate) => candidate.includes('/node_modules/.pnpm/openclaw@') && candidate !== currentBundledAcpxDir,
);
const pointsAtMissingPath = (sourcePath && !(await fileExists(sourcePath)))
|| (installPath && !(await fileExists(installPath)));
if (pointsAtDifferentBundledTree || pointsAtMissingPath) {
delete installs.acpx;
if (Object.keys(installs).length === 0) {
delete pluginsObj.installs;
}
modified = true;
console.log('[sanitize] Removed stale plugins.installs.acpx metadata');
}
}
const installedFeishuId = await resolveInstalledFeishuPluginId();
const configuredFeishuId =
FEISHU_PLUGIN_ID_CANDIDATES.find((id) => allowArr.includes(id))