Add channel health diagnostics and gateway recovery fixes (#855)

This commit is contained in:
Lingxuan Zuo
2026-04-15 13:51:02 +08:00
committed by GitHub
Unverified
parent 6acd8acf5a
commit 1f39d1a8a7
22 changed files with 1868 additions and 52 deletions

View File

@@ -62,6 +62,7 @@
"connectionStatus": {
"connected": "Connected",
"connecting": "Connecting",
"degraded": "Degraded",
"disconnected": "Disconnected",
"error": "Error"
},
@@ -99,6 +100,30 @@
"saveAndConnect": "Save & Connect",
"envVar": "Environment Variable: {{var}}"
},
"health": {
"state": {
"degraded": "Gateway degraded",
"unresponsive": "Gateway unresponsive"
},
"reasons": {
"gateway_degraded": "Gateway heartbeat recently degraded.",
"gateway_unresponsive": "Gateway control plane appears unresponsive.",
"channels_status_timeout": "Channel runtime status probe timed out.",
"rpc_timeout": "Recent Gateway RPC calls timed out.",
"gateway_not_running": "Gateway is not running.",
"gateway_error": "Gateway is in an error state.",
"runtime_error": "Channel runtime reported an error."
},
"restartGateway": "Restart Gateway",
"copyDiagnostics": "Copy Diagnostics",
"viewDiagnostics": "View Diagnostics",
"hideDiagnostics": "Hide Diagnostics",
"diagnosticsTitle": "Gateway Diagnostics Snapshot",
"diagnosticsCopied": "Diagnostics copied to clipboard",
"diagnosticsCopyFailed": "Failed to collect diagnostics: {{error}}",
"restartTriggered": "Gateway restart requested",
"restartFailed": "Failed to restart gateway: {{error}}"
},
"meta": {
"telegram": {
"description": "Connect Telegram using a bot token from @BotFather",

View File

@@ -62,6 +62,7 @@
"connectionStatus": {
"connected": "接続済み",
"connecting": "接続中",
"degraded": "劣化中",
"disconnected": "未接続",
"error": "異常"
},
@@ -99,6 +100,30 @@
"saveAndConnect": "保存して接続",
"envVar": "環境変数: {{var}}"
},
"health": {
"state": {
"degraded": "ゲートウェイ劣化",
"unresponsive": "ゲートウェイ無応答"
},
"reasons": {
"gateway_degraded": "ゲートウェイのハートビートに劣化が見られます。",
"gateway_unresponsive": "ゲートウェイの制御プレーンが無応答です。",
"channels_status_timeout": "チャンネル状態の問い合わせがタイムアウトしました。",
"rpc_timeout": "最近のゲートウェイ RPC がタイムアウトしました。",
"gateway_not_running": "ゲートウェイは起動していません。",
"gateway_error": "ゲートウェイはエラー状態です。",
"runtime_error": "チャンネルランタイムがエラーを返しました。"
},
"restartGateway": "ゲートウェイを再起動",
"copyDiagnostics": "診断をコピー",
"viewDiagnostics": "診断を表示",
"hideDiagnostics": "診断を隠す",
"diagnosticsTitle": "ゲートウェイ診断スナップショット",
"diagnosticsCopied": "診断をクリップボードにコピーしました",
"diagnosticsCopyFailed": "診断の取得に失敗しました: {{error}}",
"restartTriggered": "ゲートウェイの再起動を要求しました",
"restartFailed": "ゲートウェイの再起動に失敗しました: {{error}}"
},
"meta": {
"telegram": {
"description": "@BotFather からのボットトークンを使用して Telegram に接続します",

View File

@@ -62,6 +62,7 @@
"connectionStatus": {
"connected": "已连接",
"connecting": "连接中",
"degraded": "异常降级",
"disconnected": "未连接",
"error": "异常"
},
@@ -99,6 +100,30 @@
"saveAndConnect": "保存并连接",
"envVar": "环境变量: {{var}}"
},
"health": {
"state": {
"degraded": "网关状态异常",
"unresponsive": "网关无响应"
},
"reasons": {
"gateway_degraded": "网关心跳近期出现异常。",
"gateway_unresponsive": "网关控制面看起来已经无响应。",
"channels_status_timeout": "频道运行时状态探测超时。",
"rpc_timeout": "最近的网关 RPC 调用发生超时。",
"gateway_not_running": "网关当前未运行。",
"gateway_error": "网关当前处于错误状态。",
"runtime_error": "频道运行时返回了错误。"
},
"restartGateway": "重启网关",
"copyDiagnostics": "复制诊断快照",
"viewDiagnostics": "查看诊断快照",
"hideDiagnostics": "隐藏诊断快照",
"diagnosticsTitle": "网关诊断快照",
"diagnosticsCopied": "诊断快照已复制到剪贴板",
"diagnosticsCopyFailed": "收集诊断快照失败:{{error}}",
"restartTriggered": "已请求重启网关",
"restartFailed": "重启网关失败:{{error}}"
},
"meta": {
"telegram": {
"description": "使用 @BotFather 提供的机器人令牌连接 Telegram",

View File

@@ -1,4 +1,5 @@
export type ChannelConnectionStatus = 'connected' | 'connecting' | 'disconnected' | 'error';
export type GatewayHealthState = 'healthy' | 'degraded' | 'unresponsive';
export type ChannelConnectionStatus = 'connected' | 'connecting' | 'degraded' | 'disconnected' | 'error';
export interface ChannelRuntimeAccountSnapshot {
connected?: boolean;
@@ -19,6 +20,10 @@ export interface ChannelRuntimeSummarySnapshot {
lastError?: string | null;
}
/**
 * Optional gateway-level health context that can be passed to the channel
 * status helpers in this file alongside the per-account runtime snapshots.
 */
export interface ChannelHealthOverlay {
/**
 * Gateway health classification. When present and not 'healthy', the status
 * helpers may report a channel as 'degraded'.
 */
gatewayHealthState?: GatewayHealthState;
}
const RECENT_ACTIVITY_MS = 10 * 60 * 1000;
function hasNonEmptyError(value: string | null | undefined): boolean {
@@ -74,9 +79,11 @@ export function isChannelRuntimeConnected(
export function computeChannelRuntimeStatus(
account: ChannelRuntimeAccountSnapshot,
healthOverlay?: ChannelHealthOverlay,
): ChannelConnectionStatus {
if (isChannelRuntimeConnected(account)) return 'connected';
if (hasChannelRuntimeError(account)) return 'error';
if (healthOverlay?.gatewayHealthState && healthOverlay.gatewayHealthState !== 'healthy') return 'degraded';
if (isChannelRuntimeConnected(account)) return 'connected';
if (account.running === true) return 'connecting';
return 'disconnected';
}
@@ -84,6 +91,7 @@ export function computeChannelRuntimeStatus(
export function pickChannelRuntimeStatus(
accounts: ChannelRuntimeAccountSnapshot[],
summary?: ChannelRuntimeSummarySnapshot,
healthOverlay?: ChannelHealthOverlay,
): ChannelConnectionStatus {
if (accounts.some((account) => isChannelRuntimeConnected(account))) {
return 'connected';
@@ -93,6 +101,10 @@ export function pickChannelRuntimeStatus(
return 'error';
}
if (healthOverlay?.gatewayHealthState && healthOverlay.gatewayHealthState !== 'healthy') {
return 'degraded';
}
if (accounts.some((account) => account.running === true)) {
return 'connecting';
}

View File

@@ -1,5 +1,5 @@
import { useState, useEffect, useCallback, useMemo, useRef } from 'react';
import { RefreshCw, Trash2, AlertCircle, Plus } from 'lucide-react';
import { RefreshCw, Trash2, AlertCircle, Plus, Copy, RotateCcw, ChevronDown, ChevronUp } from 'lucide-react';
import { Button } from '@/components/ui/button';
import { Badge } from '@/components/ui/badge';
import { ConfirmDialog } from '@/components/ui/confirm-dialog';
@@ -33,7 +33,8 @@ interface ChannelAccountItem {
accountId: string;
name: string;
configured: boolean;
status: 'connected' | 'connecting' | 'disconnected' | 'error';
status: 'connected' | 'connecting' | 'degraded' | 'disconnected' | 'error';
statusReason?: string;
lastError?: string;
isDefault: boolean;
agentId?: string;
@@ -42,10 +43,51 @@ interface ChannelAccountItem {
interface ChannelGroupItem {
channelType: string;
defaultAccountId: string;
status: 'connected' | 'connecting' | 'disconnected' | 'error';
status: 'connected' | 'connecting' | 'degraded' | 'disconnected' | 'error';
statusReason?: string;
accounts: ChannelAccountItem[];
}
/**
 * Gateway health summary as consumed by the Channels page.
 * It is read from the optional `gatewayHealth` field of the channels response.
 */
interface GatewayHealthSummary {
/** Overall classification; the page shows the health banner for any value other than 'healthy'. */
state: 'healthy' | 'degraded' | 'unresponsive';
/** Reason codes; the first entry is used to look up a `health.reasons.*` translation. */
reasons: string[];
/** Count of consecutive missed heartbeats (not read by the code visible here). */
consecutiveHeartbeatMisses: number;
// NOTE(review): the fields below look like timestamps — presumably epoch
// milliseconds — but nothing visible here reads them; confirm against the producer.
lastAliveAt?: number;
lastRpcSuccessAt?: number;
lastRpcFailureAt?: number;
/** Presumably the name of the RPC method that last failed — TODO confirm. */
lastRpcFailureMethod?: string;
lastChannelsStatusOkAt?: number;
lastChannelsStatusFailureAt?: number;
}
/**
 * Diagnostics payload fetched from `/api/diagnostics/gateway-snapshot`.
 * The page serialises it to pretty-printed JSON for clipboard copy and for the
 * inline diagnostics panel.
 */
interface GatewayDiagnosticSnapshot {
/** Capture time as a number — presumably epoch milliseconds; confirm against the producer. */
capturedAt: number;
/** Platform identifier string reported by the host. */
platform: string;
/** Gateway health summary plus any additional, untyped gateway fields. */
gateway: GatewayHealthSummary & Record<string, unknown>;
/** Channel groups included in the snapshot. */
channels: ChannelGroupItem[];
// Log excerpts carried as plain strings (ClawX log, gateway log and gateway
// error log, judging by the field names).
clawxLogTail: string;
gatewayLogTail: string;
gatewayErrLogTail: string;
}
/**
 * Runtime shape check for a gateway diagnostics snapshot.
 *
 * Only the top-level fields are inspected: `capturedAt` must be a number,
 * `gateway` a non-null object, `channels` an array, and `platform` plus the
 * three log-tail fields strings. Nested contents are not validated.
 *
 * @param value - Untrusted value, typically a parsed API response.
 * @returns `true` when every top-level field has the expected primitive shape.
 */
function isGatewayDiagnosticSnapshot(value: unknown): value is GatewayDiagnosticSnapshot {
  if (typeof value !== 'object' || value === null) {
    return false;
  }

  const candidate = value as Record<string, unknown>;

  if (typeof candidate.capturedAt !== 'number') {
    return false;
  }
  if (typeof candidate.gateway !== 'object' || candidate.gateway === null) {
    return false;
  }
  if (!Array.isArray(candidate.channels)) {
    return false;
  }

  const requiredStringFields = ['platform', 'clawxLogTail', 'gatewayLogTail', 'gatewayErrLogTail'];
  return requiredStringFields.every((field) => typeof candidate[field] === 'string');
}
interface AgentItem {
id: string;
name: string;
@@ -76,11 +118,20 @@ export function Channels() {
const { t } = useTranslation('channels');
const gatewayStatus = useGatewayStore((state) => state.status);
const lastGatewayStateRef = useRef(gatewayStatus.state);
const defaultGatewayHealth = useMemo<GatewayHealthSummary>(() => ({
state: 'healthy',
reasons: [],
consecutiveHeartbeatMisses: 0,
}), []);
const [loading, setLoading] = useState(true);
const [error, setError] = useState<string | null>(null);
const [channelGroups, setChannelGroups] = useState<ChannelGroupItem[]>([]);
const [agents, setAgents] = useState<AgentItem[]>([]);
const [gatewayHealth, setGatewayHealth] = useState<GatewayHealthSummary>(defaultGatewayHealth);
const [diagnosticsSnapshot, setDiagnosticsSnapshot] = useState<GatewayDiagnosticSnapshot | null>(null);
const [showDiagnostics, setShowDiagnostics] = useState(false);
const [diagnosticsLoading, setDiagnosticsLoading] = useState(false);
const [showConfigModal, setShowConfigModal] = useState(false);
const [selectedChannelType, setSelectedChannelType] = useState<ChannelType | null>(null);
const [selectedAccountId, setSelectedAccountId] = useState<string | undefined>(undefined);
@@ -139,18 +190,29 @@ export function Channels() {
hostApiFetch<{ success: boolean; agents?: AgentItem[]; error?: string }>('/api/agents'),
]);
if (!channelsRes.success) {
throw new Error(channelsRes.error || 'Failed to load channels');
type ChannelsResponse = {
success: boolean;
channels?: ChannelGroupItem[];
gatewayHealth?: GatewayHealthSummary;
error?: string;
};
const channelsPayload = channelsRes as ChannelsResponse;
if (!channelsPayload.success) {
throw new Error(channelsPayload.error || 'Failed to load channels');
}
if (!agentsRes.success) {
throw new Error(agentsRes.error || 'Failed to load agents');
}
setChannelGroups(channelsRes.channels || []);
setChannelGroups(channelsPayload.channels || []);
setAgents(agentsRes.agents || []);
setGatewayHealth(channelsPayload.gatewayHealth || defaultGatewayHealth);
setDiagnosticsSnapshot(null);
setShowDiagnostics(false);
console.info(
`[channels-ui] fetch ok probe=${probe ? '1' : '0'} elapsedMs=${Date.now() - startedAt} view=${(channelsRes.channels || []).map((item) => `${item.channelType}:${item.status}`).join(',')}`
`[channels-ui] fetch ok probe=${probe ? '1' : '0'} elapsedMs=${Date.now() - startedAt} view=${(channelsPayload.channels || []).map((item) => `${item.channelType}:${item.status}`).join(',')}`
);
} catch (fetchError) {
// Preserve previous data on error — don't clear channelGroups/agents.
@@ -269,6 +331,100 @@ export function Channels() {
void fetchPageData({ probe: true });
};
/**
 * Fetches the gateway diagnostics snapshot, validates its shape, stores it in
 * component state and returns it.
 *
 * Throws when the response carries a string `error`, reports
 * `success === false`, or fails the snapshot shape check.
 */
const fetchDiagnosticsSnapshot = useCallback(async (): Promise<GatewayDiagnosticSnapshot> => {
  const raw = await hostApiFetch<unknown>('/api/diagnostics/gateway-snapshot');

  // Error envelopes are detected before the shape check so the server's own
  // message is surfaced when one is provided.
  if (typeof raw === 'object' && raw !== null) {
    const { success, error: reportedError } = raw as Record<string, unknown>;
    if (typeof reportedError === 'string') {
      throw new Error(reportedError);
    }
    if (success === false) {
      throw new Error('Failed to fetch gateway diagnostics snapshot');
    }
  }

  if (isGatewayDiagnosticSnapshot(raw)) {
    setDiagnosticsSnapshot(raw);
    return raw;
  }

  throw new Error('Invalid gateway diagnostics snapshot response');
}, []);
/**
 * Requests a gateway restart through the host API.
 *
 * On success it discards any cached diagnostics, shows a success toast and
 * triggers a probing page refresh. Any failure is reported with an error toast.
 */
const handleRestartGateway = async () => {
  type RestartResponse = { success?: boolean; error?: string };

  try {
    const response = await hostApiFetch<RestartResponse>('/api/gateway/restart', {
      method: 'POST',
    });

    // Only an explicit `success: true` counts as a successful restart request.
    const restartAccepted = response?.success === true;
    if (!restartAccepted) {
      const reason = response?.error || 'Failed to restart gateway';
      throw new Error(reason);
    }

    // The old snapshot was captured before the restart, so drop it.
    setShowDiagnostics(false);
    setDiagnosticsSnapshot(null);

    const successMessage = t('health.restartTriggered');
    toast.success(successMessage);
    void fetchPageData({ probe: true });
  } catch (failure) {
    const failureMessage = t('health.restartFailed', { error: String(failure) });
    toast.error(failureMessage);
  }
};
/**
 * Fetches a fresh diagnostics snapshot and copies it to the clipboard as
 * pretty-printed JSON. The loading flag is set for the whole operation and
 * always cleared afterwards; the outcome is reported with a toast.
 */
const handleCopyDiagnostics = async () => {
  setDiagnosticsLoading(true);
  try {
    const latestSnapshot = await fetchDiagnosticsSnapshot();
    const serialized = JSON.stringify(latestSnapshot, null, 2);
    await navigator.clipboard.writeText(serialized);

    const copiedMessage = t('health.diagnosticsCopied');
    toast.success(copiedMessage);
  } catch (failure) {
    const failedMessage = t('health.diagnosticsCopyFailed', { error: String(failure) });
    toast.error(failedMessage);
  } finally {
    setDiagnosticsLoading(false);
  }
};
/**
 * Toggles the inline diagnostics panel.
 *
 * Hiding is immediate. Showing first fetches a fresh snapshot; the panel is
 * opened only when that fetch succeeds, otherwise an error toast is shown and
 * the panel stays closed.
 */
const handleToggleDiagnostics = async () => {
  if (showDiagnostics) {
    setShowDiagnostics(false);
    return;
  }

  setDiagnosticsLoading(true);
  try {
    await fetchDiagnosticsSnapshot();
    // Reached only when the fetch resolved, so the panel never opens empty.
    setShowDiagnostics(true);
  } catch (diagnosticsError) {
    toast.error(t('health.diagnosticsCopyFailed', { error: String(diagnosticsError) }));
  } finally {
    // Single reset point for the loading flag, on both success and failure.
    setDiagnosticsLoading(false);
  }
};
// Translated text for the first gateway health reason; empty when no reason is reported.
const healthReasonLabel = useMemo(() => {
  const [leadingReason] = gatewayHealth.reasons;
  return leadingReason ? t(`health.reasons.${leadingReason}`) : '';
}, [gatewayHealth.reasons, t]);
// Pretty-printed JSON of the current diagnostics snapshot; empty string when none is loaded.
const diagnosticsText = useMemo(() => {
  if (!diagnosticsSnapshot) {
    return '';
  }
  return JSON.stringify(diagnosticsSnapshot, null, 2);
}, [diagnosticsSnapshot]);
/**
 * Maps a channel status to the CSS utility classes used for its badge.
 * Any status without a dedicated tone (e.g. 'disconnected') gets the neutral styling.
 */
const statusTone = useCallback((status: ChannelGroupItem['status']) => {
  if (status === 'connected') {
    return 'bg-green-500/10 text-green-700 dark:text-green-300 border-green-500/20';
  }
  if (status === 'connecting') {
    return 'bg-sky-500/10 text-sky-700 dark:text-sky-300 border-sky-500/20';
  }
  if (status === 'degraded') {
    return 'bg-yellow-500/10 text-yellow-700 dark:text-yellow-300 border-yellow-500/20';
  }
  if (status === 'error') {
    return 'bg-destructive/10 text-destructive border-destructive/20';
  }
  // Neutral fallback for every remaining status.
  return 'bg-black/5 dark:bg-white/5 text-muted-foreground border-black/10 dark:border-white/10';
}, []);
// Localized label for a channel status, looked up under `account.connectionStatus.*`.
const statusLabel = useCallback(
  (status: ChannelGroupItem['status']) => t(`account.connectionStatus.${status}`),
  [t],
);
const handleBindAgent = async (channelType: string, accountId: string, agentId: string) => {
try {
if (!agentId) {
@@ -365,6 +521,86 @@ export function Channels() {
</div>
)}
{gatewayStatus.state === 'running' && gatewayHealth.state !== 'healthy' && (
<div
data-testid="channels-health-banner"
className={cn(
'mb-8 rounded-xl border p-4',
gatewayHealth.state === 'unresponsive'
? 'border-destructive/50 bg-destructive/10'
: 'border-yellow-500/50 bg-yellow-500/10',
)}
>
<div className="flex flex-col gap-4 md:flex-row md:items-start md:justify-between">
<div className="flex items-start gap-3">
<AlertCircle
className={cn(
'mt-0.5 h-5 w-5 shrink-0',
gatewayHealth.state === 'unresponsive'
? 'text-destructive'
: 'text-yellow-600 dark:text-yellow-400',
)}
/>
<div>
<p className="text-sm font-semibold text-foreground">
{t(`health.state.${gatewayHealth.state}`)}
</p>
{healthReasonLabel && (
<p className="mt-1 text-sm text-foreground/75">{healthReasonLabel}</p>
)}
</div>
</div>
<div className="flex flex-wrap items-center gap-2">
<Button
data-testid="channels-restart-gateway"
size="sm"
variant="outline"
className="h-8 rounded-full text-xs"
onClick={() => { void handleRestartGateway(); }}
>
<RotateCcw className="mr-2 h-3.5 w-3.5" />
{t('health.restartGateway')}
</Button>
<Button
data-testid="channels-copy-diagnostics"
size="sm"
variant="outline"
className="h-8 rounded-full text-xs"
disabled={diagnosticsLoading}
onClick={() => { void handleCopyDiagnostics(); }}
>
<Copy className="mr-2 h-3.5 w-3.5" />
{t('health.copyDiagnostics')}
</Button>
<Button
data-testid="channels-toggle-diagnostics"
size="sm"
variant="outline"
className="h-8 rounded-full text-xs"
disabled={diagnosticsLoading}
onClick={() => { void handleToggleDiagnostics(); }}
>
{showDiagnostics ? (
<ChevronUp className="mr-2 h-3.5 w-3.5" />
) : (
<ChevronDown className="mr-2 h-3.5 w-3.5" />
)}
{showDiagnostics ? t('health.hideDiagnostics') : t('health.viewDiagnostics')}
</Button>
</div>
</div>
{showDiagnostics && diagnosticsText && (
<div className="mt-4 rounded-xl border border-black/10 dark:border-white/10 bg-background/80 p-3">
<p className="mb-2 text-xs font-medium text-muted-foreground">{t('health.diagnosticsTitle')}</p>
<pre data-testid="channels-diagnostics" className="max-h-[320px] overflow-auto whitespace-pre-wrap break-all text-[11px] text-foreground/85">
{diagnosticsText}
</pre>
</div>
)}
</div>
)}
{error && (
<div className="mb-8 p-4 rounded-xl border border-destructive/50 bg-destructive/10 flex items-center gap-3">
<AlertCircle className="h-5 w-5 text-destructive" />
@@ -393,18 +629,9 @@ export function Channels() {
</h3>
<p className="text-[12px] text-muted-foreground">{group.channelType}</p>
</div>
<div
className={cn(
'w-2 h-2 rounded-full shrink-0',
group.status === 'connected'
? 'bg-green-500'
: group.status === 'connecting'
? 'bg-yellow-500 animate-pulse'
: group.status === 'error'
? 'bg-destructive'
: 'bg-muted-foreground'
)}
/>
<Badge className={cn('rounded-full border px-2.5 py-0.5 text-[11px] font-medium', statusTone(group.status))}>
{statusLabel(group.status)}
</Badge>
</div>
<div className="flex items-center gap-2">
@@ -456,10 +683,18 @@ export function Channels() {
<div className="min-w-0">
<div className="flex items-center gap-2">
<p className="text-[13px] font-medium text-foreground truncate">{displayName}</p>
<Badge className={cn('rounded-full border px-2 py-0.5 text-[10px] font-medium', statusTone(account.status))}>
{statusLabel(account.status)}
</Badge>
</div>
{account.lastError && (
<div className="text-[12px] text-destructive mt-1">{account.lastError}</div>
)}
{!account.lastError && account.statusReason && account.status === 'degraded' && (
<div className="text-[12px] text-yellow-700 dark:text-yellow-300 mt-1">
{t(`health.reasons.${account.statusReason}`)}
</div>
)}
</div>
<div className="flex items-center gap-2">

View File

@@ -26,7 +26,7 @@ export type ChannelType =
/**
* Channel connection status
*/
export type ChannelStatus = 'connected' | 'disconnected' | 'connecting' | 'error';
export type ChannelStatus = 'connected' | 'disconnected' | 'connecting' | 'degraded' | 'error';
/**
* Channel connection type