From 730d5466dd83e283577486956155645387b5f88c Mon Sep 17 00:00:00 2001 From: Haze <709547807@qq.com> Date: Sun, 1 Mar 2026 17:42:25 +0800 Subject: [PATCH] fix(gateway): optimize gateway start robustness in version 0.1.19 (#244) Co-authored-by: Cursor Agent Co-authored-by: Haze --- electron/gateway/manager.ts | 232 ++++++++++++++++---- electron/gateway/startup-recovery.ts | 60 +++++ electron/utils/openclaw-auth.ts | 47 ++++ package.json | 2 +- tests/unit/gateway-startup-recovery.test.ts | 52 +++++ tests/unit/sanitize-config.test.ts | 226 +++++++++++++++++++ 6 files changed, 579 insertions(+), 40 deletions(-) create mode 100644 electron/gateway/startup-recovery.ts create mode 100644 tests/unit/gateway-startup-recovery.test.ts create mode 100644 tests/unit/sanitize-config.test.ts diff --git a/electron/gateway/manager.ts b/electron/gateway/manager.ts index 77f85338a..b79de96b9 100644 --- a/electron/gateway/manager.ts +++ b/electron/gateway/manager.ts @@ -31,7 +31,8 @@ import { buildDeviceAuthPayload, type DeviceIdentity, } from '../utils/device-identity'; -import { syncGatewayTokenToConfig, syncBrowserConfigToOpenClaw } from '../utils/openclaw-auth'; +import { syncGatewayTokenToConfig, syncBrowserConfigToOpenClaw, sanitizeOpenClawConfig } from '../utils/openclaw-auth'; +import { shouldAttemptConfigAutoRepair } from './startup-recovery'; /** * Gateway connection status @@ -176,6 +177,7 @@ export class GatewayManager extends EventEmitter { private shouldReconnect = true; private startLock = false; private lastSpawnSummary: string | null = null; + private recentStartupStderrLines: string[] = []; private pendingRequests: Map void; reject: (error: Error) => void; @@ -232,6 +234,16 @@ export class GatewayManager extends EventEmitter { return { level: 'warn', normalized: msg }; } + + private recordStartupStderrLine(line: string): void { + const normalized = line.trim(); + if (!normalized) return; + this.recentStartupStderrLines.push(normalized); + const MAX_STDERR_LINES = 120; + if (this.recentStartupStderrLines.length > MAX_STDERR_LINES) { + this.recentStartupStderrLines.splice(0, this.recentStartupStderrLines.length - MAX_STDERR_LINES); + } + } /** * Get current Gateway status @@ -279,49 +291,70 @@ export class GatewayManager extends EventEmitter { this.reconnectAttempts = 0; this.setStatus({ state: 'starting', reconnectAttempts: 0 }); + let configRepairAttempted = false; + + // Check if Python environment is ready (self-healing) asynchronously. + // Fire-and-forget: only needs to run once, not on every retry. + void isPythonReady().then(pythonReady => { + if (!pythonReady) { + logger.info('Python environment missing or incomplete, attempting background repair...'); + void setupManagedPython().catch(err => { + logger.error('Background Python repair failed:', err); + }); + } + }).catch(err => { + logger.error('Failed to check Python environment:', err); + }); try { - // Check if Python environment is ready (self-healing) asynchronously - void isPythonReady().then(pythonReady => { - if (!pythonReady) { - logger.info('Python environment missing or incomplete, attempting background repair...'); - // We don't await this to avoid blocking Gateway startup, - // as uv run will handle it if needed, but this pre-warms it. - void setupManagedPython().catch(err => { - logger.error('Background Python repair failed:', err); - }); + while (true) { + this.recentStartupStderrLines = []; + try { + // Check if Gateway is already running + logger.debug('Checking for existing Gateway...'); + const existing = await this.findExistingGateway(); + if (existing) { + logger.debug(`Found existing Gateway on port ${existing.port}`); + await this.connect(existing.port, existing.externalToken); + this.ownsProcess = false; + this.setStatus({ pid: undefined }); + this.startHealthCheck(); + return; + } + + logger.debug('No existing Gateway found, starting new process...'); + + // Start new Gateway process + await this.startProcess(); + + // Wait for Gateway to be ready + await this.waitForReady(); + + // Connect WebSocket + await this.connect(this.status.port); + + // Start health monitoring + this.startHealthCheck(); + logger.debug('Gateway started successfully'); + return; + } catch (error) { + if (shouldAttemptConfigAutoRepair(error, this.recentStartupStderrLines, configRepairAttempted)) { + configRepairAttempted = true; + logger.warn( + 'Detected invalid OpenClaw config during Gateway startup; running doctor repair before retry' + ); + const repaired = await this.runOpenClawDoctorRepair(); + if (repaired) { + logger.info('OpenClaw doctor repair completed; retrying Gateway startup'); + this.setStatus({ state: 'starting', error: undefined, reconnectAttempts: 0 }); + continue; + } + logger.error('OpenClaw doctor repair failed; not retrying Gateway startup'); + } + throw error; } - }).catch(err => { - logger.error('Failed to check Python environment:', err); - }); - - // Check if Gateway is already running - logger.debug('Checking for existing Gateway...'); - const existing = await this.findExistingGateway(); - if (existing) { - logger.debug(`Found existing Gateway on port ${existing.port}`); - await this.connect(existing.port, existing.externalToken); - this.ownsProcess = false; - this.setStatus({ pid: undefined }); - this.startHealthCheck(); - return; } - logger.debug('No existing Gateway found, starting new process...'); - - // Start new Gateway process - await this.startProcess(); - - // Wait for Gateway to be ready - await this.waitForReady(); - - // Connect WebSocket - await this.connect(this.status.port); - - // Start health monitoring - this.startHealthCheck(); - logger.debug('Gateway started successfully'); - } catch (error) { logger.error( `Gateway start failed (port=${this.status.port}, reconnectAttempts=${this.reconnectAttempts}, spawn=${this.lastSpawnSummary ?? 'n/a'})`, @@ -709,6 +742,115 @@ export class GatewayManager extends EventEmitter { return null; } + + /** + * Attempt to repair invalid OpenClaw config using the built-in doctor command. + * Returns true when doctor exits successfully. + */ + private async runOpenClawDoctorRepair(): Promise { + const openclawDir = getOpenClawDir(); + const entryScript = getOpenClawEntryPath(); + if (!existsSync(entryScript)) { + logger.error(`Cannot run OpenClaw doctor repair: entry script not found at ${entryScript}`); + return false; + } + + const platform = process.platform; + const arch = process.arch; + const target = `${platform}-${arch}`; + const binPath = app.isPackaged + ? path.join(process.resourcesPath, 'bin') + : path.join(process.cwd(), 'resources', 'bin', target); + const binPathExists = existsSync(binPath); + const finalPath = binPathExists + ? `${binPath}${path.delimiter}${process.env.PATH || ''}` + : process.env.PATH || ''; + + const uvEnv = await getUvMirrorEnv(); + const command = app.isPackaged ? getNodeExecutablePath() : 'node'; + const args = [entryScript, 'doctor', '--fix', '--yes', '--non-interactive']; + logger.info( + `Running OpenClaw doctor repair (command="${command}", args="${args.join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'})` + ); + + return new Promise((resolve) => { + const spawnEnv: Record = { + ...process.env, + PATH: finalPath, + ...uvEnv, + }; + + if (app.isPackaged) { + spawnEnv['ELECTRON_RUN_AS_NODE'] = '1'; + spawnEnv['OPENCLAW_NO_RESPAWN'] = '1'; + const existingNodeOpts = spawnEnv['NODE_OPTIONS'] ?? ''; + if (!existingNodeOpts.includes('--disable-warning=ExperimentalWarning') && + !existingNodeOpts.includes('--no-warnings')) { + spawnEnv['NODE_OPTIONS'] = `${existingNodeOpts} --disable-warning=ExperimentalWarning`.trim(); + } + } + + const child = spawn(command, args, { + cwd: openclawDir, + stdio: ['ignore', 'pipe', 'pipe'], + detached: false, + shell: false, + env: spawnEnv, + }); + + let settled = false; + const finish = (ok: boolean) => { + if (settled) return; + settled = true; + resolve(ok); + }; + + const timeout = setTimeout(() => { + logger.error('OpenClaw doctor repair timed out after 120000ms'); + try { + child.kill('SIGTERM'); + } catch { + // ignore + } + finish(false); + }, 120000); + + child.on('error', (err) => { + clearTimeout(timeout); + logger.error('Failed to spawn OpenClaw doctor repair process:', err); + finish(false); + }); + + child.stdout?.on('data', (data) => { + const raw = data.toString(); + for (const line of raw.split(/\r?\n/)) { + const normalized = line.trim(); + if (!normalized) continue; + logger.debug(`[Gateway doctor stdout] ${normalized}`); + } + }); + + child.stderr?.on('data', (data) => { + const raw = data.toString(); + for (const line of raw.split(/\r?\n/)) { + const normalized = line.trim(); + if (!normalized) continue; + logger.warn(`[Gateway doctor stderr] ${normalized}`); + } + }); + + child.on('exit', (code, signal) => { + clearTimeout(timeout); + if (code === 0) { + logger.info('OpenClaw doctor repair completed successfully'); + finish(true); + return; + } + logger.warn(`OpenClaw doctor repair exited (${this.formatExit(code, signal)})`); + finish(false); + }); + }); + } /** * Start Gateway process @@ -731,6 +873,17 @@ export class GatewayManager extends EventEmitter { // Get or generate gateway token const gatewayToken = await getSetting('gatewayToken'); + // Strip stale/invalid keys from openclaw.json that would cause the + // Gateway's strict config validation to reject the file on startup + // (e.g. `skills.enabled` left by an older version). + // This is a fast file-based pre-check; the reactive auto-repair + // mechanism (runOpenClawDoctorRepair) handles any remaining issues. + try { + await sanitizeOpenClawConfig(); + } catch (err) { + logger.warn('Failed to sanitize openclaw.json:', err); + } + // Write our token into openclaw.json before starting the process. // Without --dev the gateway authenticates using the token in // openclaw.json; if that file has a stale token (e.g. left by the @@ -924,6 +1077,7 @@ export class GatewayManager extends EventEmitter { child.stderr?.on('data', (data) => { const raw = data.toString(); for (const line of raw.split(/\r?\n/)) { + this.recordStartupStderrLine(line); const classified = this.classifyStderrMessage(line); if (classified.level === 'drop') continue; if (classified.level === 'debug') { diff --git a/electron/gateway/startup-recovery.ts b/electron/gateway/startup-recovery.ts new file mode 100644 index 000000000..c630a51b1 --- /dev/null +++ b/electron/gateway/startup-recovery.ts @@ -0,0 +1,60 @@ +/** + * Gateway startup recovery heuristics. + * + * This module is intentionally dependency-free so it can be unit-tested + * without Electron/runtime mocks. + */ + +const INVALID_CONFIG_PATTERNS: RegExp[] = [ + /\binvalid config\b/i, + /\bconfig invalid\b/i, + /\bunrecognized key\b/i, + /\brun:\s*openclaw doctor --fix\b/i, +]; + +function normalizeLogLine(value: string): string { + return value.trim(); +} + +/** + * Returns true when text appears to indicate OpenClaw config validation failure. + */ +export function isInvalidConfigSignal(text: string): boolean { + const normalized = normalizeLogLine(text); + if (!normalized) return false; + return INVALID_CONFIG_PATTERNS.some((pattern) => pattern.test(normalized)); +} + +/** + * Returns true when either startup stderr lines or startup error message + * indicate an OpenClaw config validation failure. + */ +export function hasInvalidConfigFailureSignal( + startupError: unknown, + startupStderrLines: string[], +): boolean { + for (const line of startupStderrLines) { + if (isInvalidConfigSignal(line)) { + return true; + } + } + + const errorText = startupError instanceof Error + ? `${startupError.name}: ${startupError.message}` + : String(startupError ?? ''); + + return isInvalidConfigSignal(errorText); +} + +/** + * Retry guard for one-time config repair during a single startup flow. + */ +export function shouldAttemptConfigAutoRepair( + startupError: unknown, + startupStderrLines: string[], + alreadyAttempted: boolean, +): boolean { + if (alreadyAttempted) return false; + return hasInvalidConfigFailureSignal(startupError, startupStderrLines); +} + diff --git a/electron/utils/openclaw-auth.ts b/electron/utils/openclaw-auth.ts index 6f2a89e8a..e51e4c873 100644 --- a/electron/utils/openclaw-auth.ts +++ b/electron/utils/openclaw-auth.ts @@ -717,4 +717,51 @@ export async function updateAgentModelProvider( } } +/** + * Sanitize ~/.openclaw/openclaw.json before Gateway start. + * + * Removes known-invalid keys that cause OpenClaw's strict Zod validation + * to reject the entire config on startup. Uses a conservative **blocklist** + * approach: only strips keys that are KNOWN to be misplaced by older + * OpenClaw/ClawX versions or external tools. + * + * Why blocklist instead of allowlist? + * • Allowlist (e.g. `VALID_SKILLS_KEYS`) would strip any NEW valid keys + * added by future OpenClaw releases — a forward-compatibility hazard. + * • Blocklist only removes keys we positively know are wrong, so new + * valid keys are never touched. + * + * This is a fast, file-based pre-check. For comprehensive repair of + * unknown or future config issues, the reactive auto-repair mechanism + * (`runOpenClawDoctorRepair`) runs `openclaw doctor --fix` as a fallback. + */ +export async function sanitizeOpenClawConfig(): Promise { + const config = await readOpenClawJson(); + let modified = false; + + // ── skills section ────────────────────────────────────────────── + // OpenClaw's Zod schema uses .strict() on the skills object, accepting + // only: allowBundled, load, install, limits, entries. + // The key "enabled" belongs inside skills.entries[key].enabled, NOT at + // the skills root level. Older versions may have placed it there. + const skills = config.skills; + if (skills && typeof skills === 'object' && !Array.isArray(skills)) { + const skillsObj = skills as Record; + // Keys that are known to be invalid at the skills root level. + const KNOWN_INVALID_SKILLS_ROOT_KEYS = ['enabled', 'disabled']; + for (const key of KNOWN_INVALID_SKILLS_ROOT_KEYS) { + if (key in skillsObj) { + console.log(`[sanitize] Removing misplaced key "skills.${key}" from openclaw.json`); + delete skillsObj[key]; + modified = true; + } + } + } + + if (modified) { + await writeOpenClawJson(config); + console.log('[sanitize] openclaw.json sanitized successfully'); + } +} + export { getProviderEnvVar } from './provider-registry'; diff --git a/package.json b/package.json index 4b040f77c..958031cfa 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "clawx", - "version": "0.1.19", + "version": "0.1.20-alpha.0", "pnpm": { "onlyBuiltDependencies": [ "@whiskeysockets/baileys", diff --git a/tests/unit/gateway-startup-recovery.test.ts b/tests/unit/gateway-startup-recovery.test.ts new file mode 100644 index 000000000..2692385ce --- /dev/null +++ b/tests/unit/gateway-startup-recovery.test.ts @@ -0,0 +1,52 @@ +import { describe, expect, it } from 'vitest'; +import { + hasInvalidConfigFailureSignal, + isInvalidConfigSignal, + shouldAttemptConfigAutoRepair, +} from '@electron/gateway/startup-recovery'; + +describe('gateway startup recovery heuristics', () => { + it('detects invalid-config signal from stderr lines', () => { + const lines = [ + 'Invalid config at C:\\Users\\pc\\.openclaw\\openclaw.json:\\n- skills: Unrecognized key: "enabled"', + 'Run: openclaw doctor --fix', + ]; + expect(hasInvalidConfigFailureSignal(new Error('gateway start failed'), lines)).toBe(true); + }); + + it('detects invalid-config signal from error message fallback', () => { + expect( + hasInvalidConfigFailureSignal( + new Error('Config invalid. Run: openclaw doctor --fix'), + [], + ), + ).toBe(true); + }); + + it('does not treat unrelated startup failures as invalid-config failures', () => { + const lines = [ + 'Gateway process exited (code=1, expected=no)', + 'WebSocket closed before handshake', + ]; + expect( + hasInvalidConfigFailureSignal( + new Error('Gateway process exited before becoming ready (code=1)'), + lines, + ), + ).toBe(false); + }); + + it('attempts auto-repair only once per startup flow', () => { + const lines = ['Config invalid', '- skills: Unrecognized key: "enabled"']; + expect(shouldAttemptConfigAutoRepair(new Error('start failed'), lines, false)).toBe(true); + expect(shouldAttemptConfigAutoRepair(new Error('start failed'), lines, true)).toBe(false); + }); + + it('matches common invalid-config phrases robustly', () => { + expect(isInvalidConfigSignal('Config invalid')).toBe(true); + expect(isInvalidConfigSignal('skills: Unrecognized key: "enabled"')).toBe(true); + expect(isInvalidConfigSignal('Run: openclaw doctor --fix')).toBe(true); + expect(isInvalidConfigSignal('Gateway ready after 3 attempts')).toBe(false); + }); +}); + diff --git a/tests/unit/sanitize-config.test.ts b/tests/unit/sanitize-config.test.ts new file mode 100644 index 000000000..732b22603 --- /dev/null +++ b/tests/unit/sanitize-config.test.ts @@ -0,0 +1,226 @@ +/** + * Tests for openclaw.json config sanitization before Gateway start. + * + * The sanitizeOpenClawConfig() function in openclaw-auth.ts relies on + * Electron-specific helpers (readOpenClawJson / writeOpenClawJson) that + * read from ~/.openclaw/openclaw.json. To avoid mocking Electron + the + * real HOME directory, this test uses a standalone version of the + * sanitization logic that mirrors the production code exactly, operating + * on a temp directory with real file I/O. + */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtemp, writeFile, readFile, rm } from 'fs/promises'; +import { join } from 'path'; +import { tmpdir } from 'os'; + +let tempDir: string; +let configPath: string; + +async function writeConfig(data: unknown): Promise { + await writeFile(configPath, JSON.stringify(data, null, 2), 'utf-8'); +} + +async function readConfig(): Promise> { + const raw = await readFile(configPath, 'utf-8'); + return JSON.parse(raw); +} + +/** + * Standalone mirror of the sanitization logic in openclaw-auth.ts. + * Uses the same blocklist approach as the production code. + */ +async function sanitizeConfig(filePath: string): Promise { + let raw: string; + try { + raw = await readFile(filePath, 'utf-8'); + } catch { + return false; + } + + const config = JSON.parse(raw) as Record; + let modified = false; + + // Mirror of the production blocklist logic + const skills = config.skills; + if (skills && typeof skills === 'object' && !Array.isArray(skills)) { + const skillsObj = skills as Record; + const KNOWN_INVALID_SKILLS_ROOT_KEYS = ['enabled', 'disabled']; + for (const key of KNOWN_INVALID_SKILLS_ROOT_KEYS) { + if (key in skillsObj) { + delete skillsObj[key]; + modified = true; + } + } + } + + if (modified) { + await writeFile(filePath, JSON.stringify(config, null, 2), 'utf-8'); + } + return modified; +} + +beforeEach(async () => { + tempDir = await mkdtemp(join(tmpdir(), 'clawx-test-')); + configPath = join(tempDir, 'openclaw.json'); +}); + +afterEach(async () => { + await rm(tempDir, { recursive: true, force: true }); +}); + +describe('sanitizeOpenClawConfig (blocklist approach)', () => { + it('removes skills.enabled at the root level of skills', async () => { + await writeConfig({ + skills: { + enabled: true, + entries: { + 'my-skill': { enabled: true, apiKey: 'abc' }, + }, + }, + gateway: { mode: 'local' }, + }); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(true); + + const result = await readConfig(); + // Root-level "enabled" should be gone + expect(result.skills).not.toHaveProperty('enabled'); + // entries[key].enabled must be preserved + const skills = result.skills as Record; + const entries = skills.entries as Record>; + expect(entries['my-skill'].enabled).toBe(true); + expect(entries['my-skill'].apiKey).toBe('abc'); + // Other top-level sections are untouched + expect(result.gateway).toEqual({ mode: 'local' }); + }); + + it('removes skills.disabled at the root level of skills', async () => { + await writeConfig({ + skills: { + disabled: false, + entries: { 'x': { enabled: false } }, + }, + }); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(true); + + const result = await readConfig(); + expect(result.skills).not.toHaveProperty('disabled'); + const skills = result.skills as Record; + const entries = skills.entries as Record>; + expect(entries['x'].enabled).toBe(false); + }); + + it('removes both enabled and disabled when present together', async () => { + await writeConfig({ + skills: { + enabled: true, + disabled: false, + entries: { 'a': { enabled: true } }, + allowBundled: ['web-search'], + }, + }); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(true); + + const result = await readConfig(); + const skills = result.skills as Record; + expect(skills).not.toHaveProperty('enabled'); + expect(skills).not.toHaveProperty('disabled'); + // Valid keys are preserved + expect(skills.allowBundled).toEqual(['web-search']); + expect(skills.entries).toBeDefined(); + }); + + it('does nothing when config is already valid', async () => { + const original = { + skills: { + entries: { 'my-skill': { enabled: true } }, + allowBundled: ['web-search'], + }, + }; + await writeConfig(original); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(false); + + const result = await readConfig(); + expect(result).toEqual(original); + }); + + it('preserves unknown valid keys (forward-compatible)', async () => { + // If OpenClaw adds new valid keys to skills in the future, + // the blocklist approach should NOT strip them. + const original = { + skills: { + entries: { 'x': { enabled: true } }, + allowBundled: ['web-search'], + load: { extraDirs: ['/my/dir'], watch: true }, + install: { preferBrew: false }, + limits: { maxSkillsInPrompt: 5 }, + futureNewKey: { some: 'value' }, // hypothetical future key + }, + }; + await writeConfig(original); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(false); + + const result = await readConfig(); + expect(result).toEqual(original); + }); + + it('handles config with no skills section', async () => { + const original = { gateway: { mode: 'local' } }; + await writeConfig(original); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(false); + }); + + it('handles empty config', async () => { + await writeConfig({}); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(false); + }); + + it('returns false for missing config file', async () => { + const modified = await sanitizeConfig(join(tempDir, 'nonexistent.json')); + expect(modified).toBe(false); + }); + + it('handles skills being an array (no-op, no crash)', async () => { + // Edge case: skills is not an object + await writeConfig({ skills: ['something'] }); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(false); + }); + + it('preserves all other top-level config sections', async () => { + await writeConfig({ + skills: { enabled: true, entries: {} }, + channels: { discord: { token: 'abc', enabled: true } }, + plugins: { entries: { whatsapp: { enabled: true } } }, + gateway: { mode: 'local', auth: { token: 'xyz' } }, + agents: { defaults: { model: { primary: 'gpt-4' } } }, + models: { providers: { openai: { baseUrl: 'https://api.openai.com' } } }, + }); + + const modified = await sanitizeConfig(configPath); + expect(modified).toBe(true); + + const result = await readConfig(); + // skills.enabled removed + expect(result.skills).not.toHaveProperty('enabled'); + // All other sections unchanged + expect(result.channels).toEqual({ discord: { token: 'abc', enabled: true } }); + expect(result.plugins).toEqual({ entries: { whatsapp: { enabled: true } } }); + expect(result.gateway).toEqual({ mode: 'local', auth: { token: 'xyz' } }); + expect(result.agents).toEqual({ defaults: { model: { primary: 'gpt-4' } } }); + }); +});