Add channel health diagnostics and gateway recovery fixes (#855)

This commit is contained in:
Lingxuan Zuo
2026-04-15 13:51:02 +08:00
committed by GitHub
Unverified
parent 6acd8acf5a
commit 1f39d1a8a7
22 changed files with 1868 additions and 52 deletions

View File

@@ -33,7 +33,9 @@ import {
import {
computeChannelRuntimeStatus,
pickChannelRuntimeStatus,
type ChannelConnectionStatus,
type ChannelRuntimeAccountSnapshot,
type GatewayHealthState,
} from '../../utils/channel-status';
import {
OPENCLAW_WECHAT_CHANNEL_TYPE,
@@ -65,6 +67,8 @@ import {
normalizeWhatsAppMessagingTarget,
} from '../../utils/openclaw-sdk';
import { logger } from '../../utils/logger';
import { buildGatewayHealthSummary } from '../../utils/gateway-health';
import type { GatewayHealthSummary } from '../../gateway/manager';
// listWhatsAppDirectory*FromConfig were removed from openclaw's public exports
// in 2026.3.23-1. No-op stubs; WhatsApp target picker uses session discovery.
@@ -405,7 +409,8 @@ interface ChannelAccountView {
running: boolean;
linked: boolean;
lastError?: string;
status: 'connected' | 'connecting' | 'disconnected' | 'error';
status: ChannelConnectionStatus;
statusReason?: string;
isDefault: boolean;
agentId?: string;
}
@@ -413,10 +418,34 @@ interface ChannelAccountView {
interface ChannelAccountsView {
channelType: string;
defaultAccountId: string;
status: 'connected' | 'connecting' | 'disconnected' | 'error';
status: ChannelConnectionStatus;
statusReason?: string;
accounts: ChannelAccountView[];
}
export function getChannelStatusDiagnostics(): {
lastChannelsStatusOkAt?: number;
lastChannelsStatusFailureAt?: number;
} {
return {
lastChannelsStatusOkAt,
lastChannelsStatusFailureAt,
};
}
function gatewayHealthStateForChannels(
gatewayHealthState: GatewayHealthState,
): GatewayHealthState | undefined {
return gatewayHealthState === 'healthy' ? undefined : gatewayHealthState;
}
function overlayStatusReason(
gatewayHealth: GatewayHealthSummary,
fallbackReason: string,
): string {
return gatewayHealth.reasons[0] || fallbackReason;
}
function buildGatewayStatusSnapshot(status: GatewayChannelStatusPayload | null): string {
if (!status?.channelAccounts) return 'none';
const entries = Object.entries(status.channelAccounts);
@@ -480,11 +509,13 @@ type DirectoryEntry = {
const CHANNEL_TARGET_CACHE_TTL_MS = 60_000;
const CHANNEL_TARGET_CACHE_ENABLED = process.env.VITEST !== 'true';
const channelTargetCache = new Map<string, { expiresAt: number; targets: ChannelTargetOptionView[] }>();
let lastChannelsStatusOkAt: number | undefined;
let lastChannelsStatusFailureAt: number | undefined;
async function buildChannelAccountsView(
export async function buildChannelAccountsView(
ctx: HostApiContext,
options?: { probe?: boolean },
): Promise<ChannelAccountsView[]> {
): Promise<{ channels: ChannelAccountsView[]; gatewayHealth: GatewayHealthSummary }> {
const startedAt = Date.now();
// Read config once and share across all sub-calls (was 5 readFile calls before).
const openClawConfig = await readOpenClawConfig();
@@ -507,17 +538,32 @@ async function buildChannelAccountsView(
{ probe },
probe ? 5000 : 8000,
);
lastChannelsStatusOkAt = Date.now();
logger.info(
`[channels.accounts] channels.status probe=${probe ? '1' : '0'} elapsedMs=${Date.now() - rpcStartedAt} snapshot=${buildGatewayStatusSnapshot(gatewayStatus)}`
);
} catch {
const probe = options?.probe === true;
lastChannelsStatusFailureAt = Date.now();
logger.warn(
`[channels.accounts] channels.status probe=${probe ? '1' : '0'} failed after ${Date.now() - startedAt}ms`
);
gatewayStatus = null;
}
const gatewayDiagnostics = ctx.gatewayManager.getDiagnostics?.() ?? {
consecutiveHeartbeatMisses: 0,
consecutiveRpcFailures: 0,
};
const gatewayHealth = buildGatewayHealthSummary({
status: ctx.gatewayManager.getStatus(),
diagnostics: gatewayDiagnostics,
lastChannelsStatusOkAt,
lastChannelsStatusFailureAt,
platform: process.platform,
});
const gatewayHealthState = gatewayHealthStateForChannels(gatewayHealth.state);
const channelTypes = new Set<string>([
...configuredChannels,
...Object.keys(configuredAccounts),
@@ -566,7 +612,9 @@ async function buildChannelAccountsView(
const accounts: ChannelAccountView[] = accountIds.map((accountId) => {
const runtime = runtimeAccounts.find((item) => item.accountId === accountId);
const runtimeSnapshot: ChannelRuntimeAccountSnapshot = runtime ?? {};
const status = computeChannelRuntimeStatus(runtimeSnapshot);
const status = computeChannelRuntimeStatus(runtimeSnapshot, {
gatewayHealthState,
});
return {
accountId,
name: runtime?.name || accountId,
@@ -576,6 +624,11 @@ async function buildChannelAccountsView(
linked: runtime?.linked === true,
lastError: typeof runtime?.lastError === 'string' ? runtime.lastError : undefined,
status,
statusReason: status === 'degraded'
? overlayStatusReason(gatewayHealth, 'gateway_degraded')
: status === 'error'
? 'runtime_error'
: undefined,
isDefault: accountId === defaultAccountId,
agentId: agentsSnapshot.channelAccountOwners[`${rawChannelType}:${accountId}`],
};
@@ -585,10 +638,32 @@ async function buildChannelAccountsView(
return left.accountId.localeCompare(right.accountId);
});
const visibleAccountSnapshots: ChannelRuntimeAccountSnapshot[] = accounts.map((account) => ({
connected: account.connected,
running: account.running,
linked: account.linked,
lastError: account.lastError,
}));
const hasRuntimeError = visibleAccountSnapshots.some((account) => typeof account.lastError === 'string' && account.lastError.trim())
|| Boolean(channelSummary?.error?.trim() || channelSummary?.lastError?.trim());
const baseGroupStatus = pickChannelRuntimeStatus(visibleAccountSnapshots, channelSummary);
const groupStatus = !gatewayStatus && ctx.gatewayManager.getStatus().state === 'running'
? 'degraded'
: gatewayHealthState && !hasRuntimeError && baseGroupStatus === 'connected'
? 'degraded'
: pickChannelRuntimeStatus(visibleAccountSnapshots, channelSummary, {
gatewayHealthState,
});
channels.push({
channelType: uiChannelType,
defaultAccountId,
status: pickChannelRuntimeStatus(runtimeAccounts, channelSummary),
status: groupStatus,
statusReason: !gatewayStatus && ctx.gatewayManager.getStatus().state === 'running'
? 'channels_status_timeout'
: groupStatus === 'degraded'
? overlayStatusReason(gatewayHealth, 'gateway_degraded')
: undefined,
accounts,
});
}
@@ -597,7 +672,7 @@ async function buildChannelAccountsView(
logger.info(
`[channels.accounts] response probe=${options?.probe === true ? '1' : '0'} elapsedMs=${Date.now() - startedAt} view=${sorted.map((item) => `${item.channelType}:${item.status}`).join(',')}`
);
return sorted;
return { channels: sorted, gatewayHealth };
}
function buildChannelTargetLabel(baseLabel: string, value: string): string {
@@ -1193,8 +1268,8 @@ export async function handleChannelRoutes(
try {
const probe = url.searchParams.get('probe') === '1';
logger.info(`[channels.accounts] request probe=${probe ? '1' : '0'}`);
const channels = await buildChannelAccountsView(ctx, { probe });
sendJson(res, 200, { success: true, channels });
const { channels, gatewayHealth } = await buildChannelAccountsView(ctx, { probe });
sendJson(res, 200, { success: true, channels, gatewayHealth });
} catch (error) {
sendJson(res, 500, { success: false, error: String(error) });
}

View File

@@ -0,0 +1,86 @@
import { open } from 'node:fs/promises';
import { join } from 'node:path';
import type { IncomingMessage, ServerResponse } from 'http';
import { logger } from '../../utils/logger';
import { getOpenClawConfigDir } from '../../utils/paths';
import { buildGatewayHealthSummary } from '../../utils/gateway-health';
import type { HostApiContext } from '../context';
import { sendJson } from '../route-utils';
import { buildChannelAccountsView, getChannelStatusDiagnostics } from './channels';
const DEFAULT_TAIL_LINES = 200;
async function readTail(filePath: string, tailLines = DEFAULT_TAIL_LINES): Promise<string> {
const safeTailLines = Math.max(1, Math.floor(tailLines));
try {
const file = await open(filePath, 'r');
try {
const stat = await file.stat();
if (stat.size === 0) return '';
const chunkSize = 64 * 1024;
let position = stat.size;
let content = '';
let lineCount = 0;
while (position > 0 && lineCount <= safeTailLines) {
const bytesToRead = Math.min(chunkSize, position);
position -= bytesToRead;
const buffer = Buffer.allocUnsafe(bytesToRead);
const { bytesRead } = await file.read(buffer, 0, bytesToRead, position);
content = `${buffer.subarray(0, bytesRead).toString('utf-8')}${content}`;
lineCount = content.split('\n').length - 1;
}
const lines = content.split('\n');
return lines.length <= safeTailLines ? content : lines.slice(-safeTailLines).join('\n');
} finally {
await file.close();
}
} catch {
return '';
}
}
export async function handleDiagnosticsRoutes(
req: IncomingMessage,
res: ServerResponse,
url: URL,
ctx: HostApiContext,
): Promise<boolean> {
if (url.pathname === '/api/diagnostics/gateway-snapshot' && req.method === 'GET') {
try {
const { channels } = await buildChannelAccountsView(ctx, { probe: false });
const diagnostics = ctx.gatewayManager.getDiagnostics?.() ?? {
consecutiveHeartbeatMisses: 0,
consecutiveRpcFailures: 0,
};
const channelStatusDiagnostics = getChannelStatusDiagnostics();
const gateway = {
...ctx.gatewayManager.getStatus(),
...buildGatewayHealthSummary({
status: ctx.gatewayManager.getStatus(),
diagnostics,
lastChannelsStatusOkAt: channelStatusDiagnostics.lastChannelsStatusOkAt,
lastChannelsStatusFailureAt: channelStatusDiagnostics.lastChannelsStatusFailureAt,
platform: process.platform,
}),
};
const openClawDir = getOpenClawConfigDir();
sendJson(res, 200, {
capturedAt: Date.now(),
platform: process.platform,
gateway,
channels,
clawxLogTail: await logger.readLogFile(DEFAULT_TAIL_LINES),
gatewayLogTail: await readTail(join(openClawDir, 'logs', 'gateway.log')),
gatewayErrLogTail: await readTail(join(openClawDir, 'logs', 'gateway.err.log')),
});
} catch (error) {
sendJson(res, 500, { success: false, error: String(error) });
}
return true;
}
return false;
}

View File

@@ -15,6 +15,7 @@ import { handleSkillRoutes } from './routes/skills';
import { handleFileRoutes } from './routes/files';
import { handleSessionRoutes } from './routes/sessions';
import { handleCronRoutes } from './routes/cron';
import { handleDiagnosticsRoutes } from './routes/diagnostics';
import { sendJson, setCorsHeaders, requireJsonContentType } from './route-utils';
type RouteHandler = (
@@ -35,6 +36,7 @@ const routeHandlers: RouteHandler[] = [
handleFileRoutes,
handleSessionRoutes,
handleCronRoutes,
handleDiagnosticsRoutes,
handleLogRoutes,
handleUsageRoutes,
];