Fix agent to channel (#485)

2026-03-14 14:17:42 +08:00
parent f6de56fa78
commit 9f2bc3cf68
4 changed files with 168 additions and 18 deletions
--- a/electron/api/routes/agents.ts
+++ b/electron/api/routes/agents.ts
@@ -5,10 +5,12 @@ import {
  createAgent,
  deleteAgentConfig,
  listAgentsSnapshot,
+  removeAgentWorkspaceDirectory,
  resolveAccountIdForAgent,
  updateAgentName,
 } from '../../utils/agent-config';
 import { deleteChannelAccountConfig } from '../../utils/channel-config';
+import { syncAllProviderAuthToRuntime } from '../../services/providers/provider-runtime-sync';
 import type { HostApiContext } from '../context';
 import { parseJsonBody, sendJson } from '../route-utils';

@@ -20,6 +22,84 @@ function scheduleGatewayReload(ctx: HostApiContext, reason: string): void {
  void reason;
 }

+import { exec } from 'child_process';
+import { promisify } from 'util';
+const execAsync = promisify(exec);
+
+/**
+ * Force a full Gateway process restart after agent deletion.
+ *
+ * A SIGUSR1 in-process reload is NOT sufficient here: channel plugins
+ * (e.g. Feishu) maintain long-lived WebSocket connections to external
+ * services and do not disconnect accounts that were removed from the
+ * config during an in-process reload.  The only reliable way to drop
+ * stale bot connections is to kill the Gateway process entirely and
+ * spawn a fresh one that reads the updated openclaw.json from scratch.
+ */
+async function restartGatewayForAgentDeletion(ctx: HostApiContext): Promise<void> {
+  try {
+    // Capture the PID of the running Gateway BEFORE stop() clears it.
+    const status = ctx.gatewayManager.getStatus();
+    const pid = status.pid;
+    const port = status.port;
+    console.log('[agents] Triggering Gateway restart (kill+respawn) after agent deletion', { pid, port });
+
+    // Force-kill the Gateway process by PID.  The manager's stop() only
+    // kills "owned" processes; if the manager connected to an already-
+    // running Gateway (ownsProcess=false), stop() simply closes the WS
+    // and the old process stays alive with its stale channel connections.
+    if (pid) {
+      try {
+        process.kill(pid, 'SIGTERM');
+        // Give it a moment to die
+        await new Promise((resolve) => setTimeout(resolve, 500));
+        try { process.kill(pid, 0); process.kill(pid, 'SIGKILL'); } catch { /* already dead */ }
+      } catch {
+        // process already gone – that's fine
+      }
+    } else if (port) {
+      // If we don't know the PID (e.g. connected to an orphaned Gateway from
+      // a previous pnpm dev run), forcefully kill whatever is on the port.
+      try {
+        if (process.platform === 'darwin' || process.platform === 'linux') {
+          // MUST use -sTCP:LISTEN. Otherwise lsof returns the client process (ClawX itself) 
+          // that has an ESTABLISHED WebSocket connection to the port, causing us to kill ourselves.
+          const { stdout } = await execAsync(`lsof -t -i :${port} -sTCP:LISTEN`);
+          const pids = stdout.trim().split('\n').filter(Boolean);
+          for (const p of pids) {
+            try { process.kill(parseInt(p, 10), 'SIGTERM'); } catch { /* ignore */ }
+          }
+          await new Promise((resolve) => setTimeout(resolve, 500));
+          for (const p of pids) {
+            try { process.kill(parseInt(p, 10), 'SIGKILL'); } catch { /* ignore */ }
+          }
+        } else if (process.platform === 'win32') {
+          // Find PID listening on the port
+          const { stdout } = await execAsync(`netstat -ano | findstr :${port}`);
+          const lines = stdout.trim().split('\n');
+          const pids = new Set<string>();
+          for (const line of lines) {
+            const parts = line.trim().split(/\s+/);
+            if (parts.length >= 5 && parts[1].endsWith(`:${port}`) && parts[3] === 'LISTENING') {
+              pids.add(parts[4]);
+            }
+          }
+          for (const p of pids) {
+            try { await execAsync(`taskkill /F /PID ${p}`); } catch { /* ignore */ }
+          }
+        }
+      } catch {
+        // Port might not be bound or command failed; ignore
+      }
+    }
+
+    await ctx.gatewayManager.restart();
+    console.log('[agents] Gateway restart completed after agent deletion');
+  } catch (err) {
+    console.warn('[agents] Gateway restart after agent deletion failed:', err);
+  }
+}
+
 export async function handleAgentRoutes(
  req: IncomingMessage,
  res: ServerResponse,
@@ -35,6 +115,13 @@ export async function handleAgentRoutes(
    try {
      const body = await parseJsonBody<{ name: string }>(req);
      const snapshot = await createAgent(body.name);
+      // Sync provider API keys to the new agent's auth-profiles.json so the
+      // embedded runner can authenticate with LLM providers when messages
+      // arrive via channel bots (e.g. Feishu). Without this, the copied
+      // auth-profiles.json may contain a stale key → 401 from the LLM.
+      syncAllProviderAuthToRuntime().catch((err) => {
+        console.warn('[agents] Failed to sync provider auth after agent creation:', err);
+      });
      scheduleGatewayReload(ctx, 'create-agent');
      sendJson(res, 200, { success: true, ...snapshot });
    } catch (error) {
@@ -81,8 +168,15 @@ export async function handleAgentRoutes(
    if (parts.length === 1) {
      try {
        const agentId = decodeURIComponent(parts[0]);
-        const snapshot = await deleteAgentConfig(agentId);
-        scheduleGatewayReload(ctx, 'delete-agent');
+        const { snapshot, removedEntry } = await deleteAgentConfig(agentId);
+        // Await reload synchronously BEFORE responding to the client.
+        // This ensures the Feishu plugin has disconnected the deleted bot
+        // before the UI shows "delete success" and the user tries chatting.
+        await restartGatewayForAgentDeletion(ctx);
+        // Delete workspace after reload so the new config is already live.
+        await removeAgentWorkspaceDirectory(removedEntry).catch((err) => {
+          console.warn('[agents] Failed to remove workspace after agent deletion:', err);
+        });
        sendJson(res, 200, { success: true, ...snapshot });
      } catch (error) {
        sendJson(res, 500, { success: false, error: String(error) });
--- a/electron/utils/agent-config.ts
+++ b/electron/utils/agent-config.ts
@@ -335,8 +335,8 @@ function getManagedWorkspaceDirectory(agent: AgentListEntry): string | null {
  return normalizedConfigured === normalizedManaged ? configuredWorkspace : null;
 }

-async function removeAgentWorkspaceDirectory(agent: AgentListEntry): Promise<void> {
-  const workspaceDir = getManagedWorkspaceDirectory(agent);
+export async function removeAgentWorkspaceDirectory(agent: { id: string; workspace?: string }): Promise<void> {
+  const workspaceDir = getManagedWorkspaceDirectory(agent as AgentListEntry);
  if (!workspaceDir) {
    logger.warn('Skipping agent workspace deletion for unmanaged path', {
      agentId: agent.id,
@@ -567,7 +567,7 @@ export async function updateAgentName(agentId: string, name: string): Promise<Ag
  });
 }

-export async function deleteAgentConfig(agentId: string): Promise<AgentsSnapshot> {
+export async function deleteAgentConfig(agentId: string): Promise<{ snapshot: AgentsSnapshot; removedEntry: AgentListEntry }> {
  return withConfigLock(async () => {
    if (agentId === MAIN_AGENT_ID) {
      throw new Error('The main agent cannot be deleted');
@@ -599,9 +599,14 @@ export async function deleteAgentConfig(agentId: string): Promise<AgentsSnapshot
    await writeOpenClawConfig(config);
    await deleteAgentChannelAccounts(agentId);
    await removeAgentRuntimeDirectory(agentId);
-    await removeAgentWorkspaceDirectory(removedEntry);
+    // NOTE: workspace directory is NOT deleted here intentionally.
+    // The caller (route handler) defers workspace removal until after
+    // the Gateway process has fully restarted, so that any in-flight
+    // process.chdir(workspace) calls complete before the directory
+    // disappears (otherwise process.cwd() throws ENOENT for the rest
+    // of the Gateway's lifetime).
    logger.info('Deleted agent config entry', { agentId });
-    return buildSnapshotFromConfig(config);
+    return { snapshot: await buildSnapshotFromConfig(config), removedEntry };
  });
 }

--- a/electron/utils/channel-config.ts
+++ b/electron/utils/channel-config.ts
@@ -24,6 +24,25 @@ const CHANNEL_TOP_LEVEL_KEYS_TO_KEEP = new Set(['accounts', 'defaultAccount', 'e
 // Channels that are managed as plugins (config goes under plugins.entries, not channels)
 const PLUGIN_CHANNELS = ['whatsapp'];

+// Unique credential key per channel type – used for duplicate bot detection.
+// Maps each channel type to the field that uniquely identifies a bot/account.
+// When two agents try to use the same value for this field, the save is rejected.
+const CHANNEL_UNIQUE_CREDENTIAL_KEY: Record<string, string> = {
+    feishu: 'appId',
+    wecom: 'botId',
+    dingtalk: 'clientId',
+    telegram: 'botToken',
+    discord: 'token',
+    qqbot: 'appId',
+    signal: 'phoneNumber',
+    imessage: 'serverUrl',
+    matrix: 'accessToken',
+    line: 'channelAccessToken',
+    msteams: 'appId',
+    googlechat: 'serviceAccountKey',
+    mattermost: 'botToken',
+};
+
 // ── Helpers ──────────────────────────────────────────────────────

 async function fileExists(p: string): Promise<boolean> {
@@ -316,6 +335,39 @@ function migrateLegacyChannelConfigToAccounts(
    }
 }

+/**
+ * Throws if the unique credential (e.g. appId for Feishu) in `config` is
+ * already registered under a *different* account in the same channel section.
+ * This prevents two agents from silently sharing the same bot connection.
+ */
+function assertNoDuplicateCredential(
+    channelType: string,
+    config: ChannelConfigData,
+    channelSection: ChannelConfigData,
+    resolvedAccountId: string,
+): void {
+    const uniqueKey = CHANNEL_UNIQUE_CREDENTIAL_KEY[channelType];
+    if (!uniqueKey) return;
+
+    const incomingValue = config[uniqueKey];
+    if (!incomingValue || typeof incomingValue !== 'string') return;
+
+    const accounts = channelSection.accounts as Record<string, ChannelConfigData> | undefined;
+    if (!accounts) return;
+
+    for (const [existingAccountId, accountCfg] of Object.entries(accounts)) {
+        if (existingAccountId === resolvedAccountId) continue;
+        if (!accountCfg || typeof accountCfg !== 'object') continue;
+        const existingValue = accountCfg[uniqueKey];
+        if (typeof existingValue === 'string' && existingValue === incomingValue) {
+            throw new Error(
+                `The ${channelType} bot (${uniqueKey}: ${incomingValue}) is already bound to another agent (account: ${existingAccountId}). ` +
+                `Each agent must use a unique bot.`,
+            );
+        }
+    }
+}
+
 export async function saveChannelConfig(
    channelType: string,
    config: ChannelConfigData,
@@ -358,6 +410,10 @@ export async function saveChannelConfig(

        const channelSection = currentConfig.channels[channelType];
        migrateLegacyChannelConfigToAccounts(channelSection, DEFAULT_ACCOUNT_ID);
+
+        // Guard: reject if this bot/app credential is already used by another account.
+        assertNoDuplicateCredential(channelType, config, channelSection, resolvedAccountId);
+
        const existingAccountConfig = resolveAccountConfig(channelSection, resolvedAccountId);
        const transformedConfig = transformChannelConfig(channelType, config, existingAccountConfig);

--- a/electron/utils/openclaw-workspace.ts
+++ b/electron/utils/openclaw-workspace.ts
@@ -5,7 +5,7 @@
 * main thread.
 */
 import { access, readFile, writeFile, readdir, mkdir, unlink } from 'fs/promises';
-import { constants, Dirent } from 'fs';
+import { constants } from 'fs';
 import { join } from 'path';
 import { homedir } from 'os';
 import { logger } from './logger';
@@ -78,16 +78,11 @@ async function resolveAllWorkspaceDirs(): Promise<string[]> {
    // ignore config parse errors
  }

-  try {
-    const entries: Dirent[] = await readdir(openclawDir, { withFileTypes: true });
-    for (const entry of entries) {
-      if (entry.isDirectory() && entry.name.startsWith('workspace')) {
-        dirs.add(join(openclawDir, entry.name));
-      }
-    }
-  } catch {
-    // ignore read errors
-  }
+  // We intentionally do NOT scan ~/.openclaw/ for any directory starting
+  // with 'workspace'. Doing so causes a race condition where a recently deleted
+  // agent's workspace (e.g., workspace-code23) is found and resuscitated by
+  // the context merge routine before its deletion finishes. Only workspaces
+  // explicitly declared in openclaw.json should be seeded.

  if (dirs.size === 0) {
    dirs.add(join(openclawDir, 'workspace'));