fix(gateway): optimize gateway start robustness in version 0.1.19 (#244)

Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Co-authored-by: Haze <hazeone@users.noreply.github.com>
This commit is contained in:
Haze
2026-03-01 17:42:25 +08:00
committed by GitHub
Unverified
parent 7d0621dcc2
commit 730d5466dd
6 changed files with 579 additions and 40 deletions

View File

@@ -31,7 +31,8 @@ import {
buildDeviceAuthPayload,
type DeviceIdentity,
} from '../utils/device-identity';
import { syncGatewayTokenToConfig, syncBrowserConfigToOpenClaw } from '../utils/openclaw-auth';
import { syncGatewayTokenToConfig, syncBrowserConfigToOpenClaw, sanitizeOpenClawConfig } from '../utils/openclaw-auth';
import { shouldAttemptConfigAutoRepair } from './startup-recovery';
/**
* Gateway connection status
@@ -176,6 +177,7 @@ export class GatewayManager extends EventEmitter {
private shouldReconnect = true;
private startLock = false;
private lastSpawnSummary: string | null = null;
private recentStartupStderrLines: string[] = [];
private pendingRequests: Map<string, {
resolve: (value: unknown) => void;
reject: (error: Error) => void;
@@ -233,6 +235,16 @@ export class GatewayManager extends EventEmitter {
return { level: 'warn', normalized: msg };
}
private recordStartupStderrLine(line: string): void {
const normalized = line.trim();
if (!normalized) return;
this.recentStartupStderrLines.push(normalized);
const MAX_STDERR_LINES = 120;
if (this.recentStartupStderrLines.length > MAX_STDERR_LINES) {
this.recentStartupStderrLines.splice(0, this.recentStartupStderrLines.length - MAX_STDERR_LINES);
}
}
/**
* Get current Gateway status
*/
@@ -279,49 +291,70 @@ export class GatewayManager extends EventEmitter {
this.reconnectAttempts = 0;
this.setStatus({ state: 'starting', reconnectAttempts: 0 });
let configRepairAttempted = false;
// Check if Python environment is ready (self-healing) asynchronously.
// Fire-and-forget: only needs to run once, not on every retry.
void isPythonReady().then(pythonReady => {
if (!pythonReady) {
logger.info('Python environment missing or incomplete, attempting background repair...');
void setupManagedPython().catch(err => {
logger.error('Background Python repair failed:', err);
});
}
}).catch(err => {
logger.error('Failed to check Python environment:', err);
});
try {
// Check if Python environment is ready (self-healing) asynchronously
void isPythonReady().then(pythonReady => {
if (!pythonReady) {
logger.info('Python environment missing or incomplete, attempting background repair...');
// We don't await this to avoid blocking Gateway startup,
// as uv run will handle it if needed, but this pre-warms it.
void setupManagedPython().catch(err => {
logger.error('Background Python repair failed:', err);
});
while (true) {
this.recentStartupStderrLines = [];
try {
// Check if Gateway is already running
logger.debug('Checking for existing Gateway...');
const existing = await this.findExistingGateway();
if (existing) {
logger.debug(`Found existing Gateway on port ${existing.port}`);
await this.connect(existing.port, existing.externalToken);
this.ownsProcess = false;
this.setStatus({ pid: undefined });
this.startHealthCheck();
return;
}
logger.debug('No existing Gateway found, starting new process...');
// Start new Gateway process
await this.startProcess();
// Wait for Gateway to be ready
await this.waitForReady();
// Connect WebSocket
await this.connect(this.status.port);
// Start health monitoring
this.startHealthCheck();
logger.debug('Gateway started successfully');
return;
} catch (error) {
if (shouldAttemptConfigAutoRepair(error, this.recentStartupStderrLines, configRepairAttempted)) {
configRepairAttempted = true;
logger.warn(
'Detected invalid OpenClaw config during Gateway startup; running doctor repair before retry'
);
const repaired = await this.runOpenClawDoctorRepair();
if (repaired) {
logger.info('OpenClaw doctor repair completed; retrying Gateway startup');
this.setStatus({ state: 'starting', error: undefined, reconnectAttempts: 0 });
continue;
}
logger.error('OpenClaw doctor repair failed; not retrying Gateway startup');
}
throw error;
}
}).catch(err => {
logger.error('Failed to check Python environment:', err);
});
// Check if Gateway is already running
logger.debug('Checking for existing Gateway...');
const existing = await this.findExistingGateway();
if (existing) {
logger.debug(`Found existing Gateway on port ${existing.port}`);
await this.connect(existing.port, existing.externalToken);
this.ownsProcess = false;
this.setStatus({ pid: undefined });
this.startHealthCheck();
return;
}
logger.debug('No existing Gateway found, starting new process...');
// Start new Gateway process
await this.startProcess();
// Wait for Gateway to be ready
await this.waitForReady();
// Connect WebSocket
await this.connect(this.status.port);
// Start health monitoring
this.startHealthCheck();
logger.debug('Gateway started successfully');
} catch (error) {
logger.error(
`Gateway start failed (port=${this.status.port}, reconnectAttempts=${this.reconnectAttempts}, spawn=${this.lastSpawnSummary ?? 'n/a'})`,
@@ -710,6 +743,115 @@ export class GatewayManager extends EventEmitter {
return null;
}
/**
* Attempt to repair invalid OpenClaw config using the built-in doctor command.
* Returns true when doctor exits successfully.
*/
private async runOpenClawDoctorRepair(): Promise<boolean> {
const openclawDir = getOpenClawDir();
const entryScript = getOpenClawEntryPath();
if (!existsSync(entryScript)) {
logger.error(`Cannot run OpenClaw doctor repair: entry script not found at ${entryScript}`);
return false;
}
const platform = process.platform;
const arch = process.arch;
const target = `${platform}-${arch}`;
const binPath = app.isPackaged
? path.join(process.resourcesPath, 'bin')
: path.join(process.cwd(), 'resources', 'bin', target);
const binPathExists = existsSync(binPath);
const finalPath = binPathExists
? `${binPath}${path.delimiter}${process.env.PATH || ''}`
: process.env.PATH || '';
const uvEnv = await getUvMirrorEnv();
const command = app.isPackaged ? getNodeExecutablePath() : 'node';
const args = [entryScript, 'doctor', '--fix', '--yes', '--non-interactive'];
logger.info(
`Running OpenClaw doctor repair (command="${command}", args="${args.join(' ')}", cwd="${openclawDir}", bundledBin=${binPathExists ? 'yes' : 'no'})`
);
return new Promise<boolean>((resolve) => {
const spawnEnv: Record<string, string | undefined> = {
...process.env,
PATH: finalPath,
...uvEnv,
};
if (app.isPackaged) {
spawnEnv['ELECTRON_RUN_AS_NODE'] = '1';
spawnEnv['OPENCLAW_NO_RESPAWN'] = '1';
const existingNodeOpts = spawnEnv['NODE_OPTIONS'] ?? '';
if (!existingNodeOpts.includes('--disable-warning=ExperimentalWarning') &&
!existingNodeOpts.includes('--no-warnings')) {
spawnEnv['NODE_OPTIONS'] = `${existingNodeOpts} --disable-warning=ExperimentalWarning`.trim();
}
}
const child = spawn(command, args, {
cwd: openclawDir,
stdio: ['ignore', 'pipe', 'pipe'],
detached: false,
shell: false,
env: spawnEnv,
});
let settled = false;
const finish = (ok: boolean) => {
if (settled) return;
settled = true;
resolve(ok);
};
const timeout = setTimeout(() => {
logger.error('OpenClaw doctor repair timed out after 120000ms');
try {
child.kill('SIGTERM');
} catch {
// ignore
}
finish(false);
}, 120000);
child.on('error', (err) => {
clearTimeout(timeout);
logger.error('Failed to spawn OpenClaw doctor repair process:', err);
finish(false);
});
child.stdout?.on('data', (data) => {
const raw = data.toString();
for (const line of raw.split(/\r?\n/)) {
const normalized = line.trim();
if (!normalized) continue;
logger.debug(`[Gateway doctor stdout] ${normalized}`);
}
});
child.stderr?.on('data', (data) => {
const raw = data.toString();
for (const line of raw.split(/\r?\n/)) {
const normalized = line.trim();
if (!normalized) continue;
logger.warn(`[Gateway doctor stderr] ${normalized}`);
}
});
child.on('exit', (code, signal) => {
clearTimeout(timeout);
if (code === 0) {
logger.info('OpenClaw doctor repair completed successfully');
finish(true);
return;
}
logger.warn(`OpenClaw doctor repair exited (${this.formatExit(code, signal)})`);
finish(false);
});
});
}
/**
* Start Gateway process
* Uses OpenClaw npm package from node_modules (dev) or resources (production)
@@ -731,6 +873,17 @@ export class GatewayManager extends EventEmitter {
// Get or generate gateway token
const gatewayToken = await getSetting('gatewayToken');
// Strip stale/invalid keys from openclaw.json that would cause the
// Gateway's strict config validation to reject the file on startup
// (e.g. `skills.enabled` left by an older version).
// This is a fast file-based pre-check; the reactive auto-repair
// mechanism (runOpenClawDoctorRepair) handles any remaining issues.
try {
await sanitizeOpenClawConfig();
} catch (err) {
logger.warn('Failed to sanitize openclaw.json:', err);
}
// Write our token into openclaw.json before starting the process.
// Without --dev the gateway authenticates using the token in
// openclaw.json; if that file has a stale token (e.g. left by the
@@ -924,6 +1077,7 @@ export class GatewayManager extends EventEmitter {
child.stderr?.on('data', (data) => {
const raw = data.toString();
for (const line of raw.split(/\r?\n/)) {
this.recordStartupStderrLine(line);
const classified = this.classifyStderrMessage(line);
if (classified.level === 'drop') continue;
if (classified.level === 'debug') {

View File

@@ -0,0 +1,60 @@
/**
* Gateway startup recovery heuristics.
*
* This module is intentionally dependency-free so it can be unit-tested
* without Electron/runtime mocks.
*/
const INVALID_CONFIG_PATTERNS: RegExp[] = [
/\binvalid config\b/i,
/\bconfig invalid\b/i,
/\bunrecognized key\b/i,
/\brun:\s*openclaw doctor --fix\b/i,
];
function normalizeLogLine(value: string): string {
return value.trim();
}
/**
* Returns true when text appears to indicate OpenClaw config validation failure.
*/
export function isInvalidConfigSignal(text: string): boolean {
const normalized = normalizeLogLine(text);
if (!normalized) return false;
return INVALID_CONFIG_PATTERNS.some((pattern) => pattern.test(normalized));
}
/**
* Returns true when either startup stderr lines or startup error message
* indicate an OpenClaw config validation failure.
*/
export function hasInvalidConfigFailureSignal(
startupError: unknown,
startupStderrLines: string[],
): boolean {
for (const line of startupStderrLines) {
if (isInvalidConfigSignal(line)) {
return true;
}
}
const errorText = startupError instanceof Error
? `${startupError.name}: ${startupError.message}`
: String(startupError ?? '');
return isInvalidConfigSignal(errorText);
}
/**
* Retry guard for one-time config repair during a single startup flow.
*/
export function shouldAttemptConfigAutoRepair(
startupError: unknown,
startupStderrLines: string[],
alreadyAttempted: boolean,
): boolean {
if (alreadyAttempted) return false;
return hasInvalidConfigFailureSignal(startupError, startupStderrLines);
}

View File

@@ -717,4 +717,51 @@ export async function updateAgentModelProvider(
}
}
/**
* Sanitize ~/.openclaw/openclaw.json before Gateway start.
*
* Removes known-invalid keys that cause OpenClaw's strict Zod validation
* to reject the entire config on startup. Uses a conservative **blocklist**
* approach: only strips keys that are KNOWN to be misplaced by older
* OpenClaw/ClawX versions or external tools.
*
* Why blocklist instead of allowlist?
* • Allowlist (e.g. `VALID_SKILLS_KEYS`) would strip any NEW valid keys
* added by future OpenClaw releases — a forward-compatibility hazard.
* • Blocklist only removes keys we positively know are wrong, so new
* valid keys are never touched.
*
* This is a fast, file-based pre-check. For comprehensive repair of
* unknown or future config issues, the reactive auto-repair mechanism
* (`runOpenClawDoctorRepair`) runs `openclaw doctor --fix` as a fallback.
*/
export async function sanitizeOpenClawConfig(): Promise<void> {
const config = await readOpenClawJson();
let modified = false;
// ── skills section ──────────────────────────────────────────────
// OpenClaw's Zod schema uses .strict() on the skills object, accepting
// only: allowBundled, load, install, limits, entries.
// The key "enabled" belongs inside skills.entries[key].enabled, NOT at
// the skills root level. Older versions may have placed it there.
const skills = config.skills;
if (skills && typeof skills === 'object' && !Array.isArray(skills)) {
const skillsObj = skills as Record<string, unknown>;
// Keys that are known to be invalid at the skills root level.
const KNOWN_INVALID_SKILLS_ROOT_KEYS = ['enabled', 'disabled'];
for (const key of KNOWN_INVALID_SKILLS_ROOT_KEYS) {
if (key in skillsObj) {
console.log(`[sanitize] Removing misplaced key "skills.${key}" from openclaw.json`);
delete skillsObj[key];
modified = true;
}
}
}
if (modified) {
await writeOpenClawJson(config);
console.log('[sanitize] openclaw.json sanitized successfully');
}
}
export { getProviderEnvVar } from './provider-registry';

View File

@@ -1,6 +1,6 @@
{
"name": "clawx",
"version": "0.1.19",
"version": "0.1.20-alpha.0",
"pnpm": {
"onlyBuiltDependencies": [
"@whiskeysockets/baileys",

View File

@@ -0,0 +1,52 @@
import { describe, expect, it } from 'vitest';
import {
hasInvalidConfigFailureSignal,
isInvalidConfigSignal,
shouldAttemptConfigAutoRepair,
} from '@electron/gateway/startup-recovery';
describe('gateway startup recovery heuristics', () => {
it('detects invalid-config signal from stderr lines', () => {
const lines = [
'Invalid config at C:\\Users\\pc\\.openclaw\\openclaw.json:\\n- skills: Unrecognized key: "enabled"',
'Run: openclaw doctor --fix',
];
expect(hasInvalidConfigFailureSignal(new Error('gateway start failed'), lines)).toBe(true);
});
it('detects invalid-config signal from error message fallback', () => {
expect(
hasInvalidConfigFailureSignal(
new Error('Config invalid. Run: openclaw doctor --fix'),
[],
),
).toBe(true);
});
it('does not treat unrelated startup failures as invalid-config failures', () => {
const lines = [
'Gateway process exited (code=1, expected=no)',
'WebSocket closed before handshake',
];
expect(
hasInvalidConfigFailureSignal(
new Error('Gateway process exited before becoming ready (code=1)'),
lines,
),
).toBe(false);
});
it('attempts auto-repair only once per startup flow', () => {
const lines = ['Config invalid', '- skills: Unrecognized key: "enabled"'];
expect(shouldAttemptConfigAutoRepair(new Error('start failed'), lines, false)).toBe(true);
expect(shouldAttemptConfigAutoRepair(new Error('start failed'), lines, true)).toBe(false);
});
it('matches common invalid-config phrases robustly', () => {
expect(isInvalidConfigSignal('Config invalid')).toBe(true);
expect(isInvalidConfigSignal('skills: Unrecognized key: "enabled"')).toBe(true);
expect(isInvalidConfigSignal('Run: openclaw doctor --fix')).toBe(true);
expect(isInvalidConfigSignal('Gateway ready after 3 attempts')).toBe(false);
});
});

View File

@@ -0,0 +1,226 @@
/**
* Tests for openclaw.json config sanitization before Gateway start.
*
* The sanitizeOpenClawConfig() function in openclaw-auth.ts relies on
* Electron-specific helpers (readOpenClawJson / writeOpenClawJson) that
* read from ~/.openclaw/openclaw.json. To avoid mocking Electron + the
* real HOME directory, this test uses a standalone version of the
* sanitization logic that mirrors the production code exactly, operating
* on a temp directory with real file I/O.
*/
import { describe, it, expect, beforeEach, afterEach } from 'vitest';
import { mkdtemp, writeFile, readFile, rm } from 'fs/promises';
import { join } from 'path';
import { tmpdir } from 'os';
let tempDir: string;
let configPath: string;
async function writeConfig(data: unknown): Promise<void> {
await writeFile(configPath, JSON.stringify(data, null, 2), 'utf-8');
}
async function readConfig(): Promise<Record<string, unknown>> {
const raw = await readFile(configPath, 'utf-8');
return JSON.parse(raw);
}
/**
* Standalone mirror of the sanitization logic in openclaw-auth.ts.
* Uses the same blocklist approach as the production code.
*/
async function sanitizeConfig(filePath: string): Promise<boolean> {
let raw: string;
try {
raw = await readFile(filePath, 'utf-8');
} catch {
return false;
}
const config = JSON.parse(raw) as Record<string, unknown>;
let modified = false;
// Mirror of the production blocklist logic
const skills = config.skills;
if (skills && typeof skills === 'object' && !Array.isArray(skills)) {
const skillsObj = skills as Record<string, unknown>;
const KNOWN_INVALID_SKILLS_ROOT_KEYS = ['enabled', 'disabled'];
for (const key of KNOWN_INVALID_SKILLS_ROOT_KEYS) {
if (key in skillsObj) {
delete skillsObj[key];
modified = true;
}
}
}
if (modified) {
await writeFile(filePath, JSON.stringify(config, null, 2), 'utf-8');
}
return modified;
}
beforeEach(async () => {
tempDir = await mkdtemp(join(tmpdir(), 'clawx-test-'));
configPath = join(tempDir, 'openclaw.json');
});
afterEach(async () => {
await rm(tempDir, { recursive: true, force: true });
});
describe('sanitizeOpenClawConfig (blocklist approach)', () => {
it('removes skills.enabled at the root level of skills', async () => {
await writeConfig({
skills: {
enabled: true,
entries: {
'my-skill': { enabled: true, apiKey: 'abc' },
},
},
gateway: { mode: 'local' },
});
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(true);
const result = await readConfig();
// Root-level "enabled" should be gone
expect(result.skills).not.toHaveProperty('enabled');
// entries[key].enabled must be preserved
const skills = result.skills as Record<string, unknown>;
const entries = skills.entries as Record<string, Record<string, unknown>>;
expect(entries['my-skill'].enabled).toBe(true);
expect(entries['my-skill'].apiKey).toBe('abc');
// Other top-level sections are untouched
expect(result.gateway).toEqual({ mode: 'local' });
});
it('removes skills.disabled at the root level of skills', async () => {
await writeConfig({
skills: {
disabled: false,
entries: { 'x': { enabled: false } },
},
});
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(true);
const result = await readConfig();
expect(result.skills).not.toHaveProperty('disabled');
const skills = result.skills as Record<string, unknown>;
const entries = skills.entries as Record<string, Record<string, unknown>>;
expect(entries['x'].enabled).toBe(false);
});
it('removes both enabled and disabled when present together', async () => {
await writeConfig({
skills: {
enabled: true,
disabled: false,
entries: { 'a': { enabled: true } },
allowBundled: ['web-search'],
},
});
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(true);
const result = await readConfig();
const skills = result.skills as Record<string, unknown>;
expect(skills).not.toHaveProperty('enabled');
expect(skills).not.toHaveProperty('disabled');
// Valid keys are preserved
expect(skills.allowBundled).toEqual(['web-search']);
expect(skills.entries).toBeDefined();
});
it('does nothing when config is already valid', async () => {
const original = {
skills: {
entries: { 'my-skill': { enabled: true } },
allowBundled: ['web-search'],
},
};
await writeConfig(original);
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(false);
const result = await readConfig();
expect(result).toEqual(original);
});
it('preserves unknown valid keys (forward-compatible)', async () => {
// If OpenClaw adds new valid keys to skills in the future,
// the blocklist approach should NOT strip them.
const original = {
skills: {
entries: { 'x': { enabled: true } },
allowBundled: ['web-search'],
load: { extraDirs: ['/my/dir'], watch: true },
install: { preferBrew: false },
limits: { maxSkillsInPrompt: 5 },
futureNewKey: { some: 'value' }, // hypothetical future key
},
};
await writeConfig(original);
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(false);
const result = await readConfig();
expect(result).toEqual(original);
});
it('handles config with no skills section', async () => {
const original = { gateway: { mode: 'local' } };
await writeConfig(original);
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(false);
});
it('handles empty config', async () => {
await writeConfig({});
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(false);
});
it('returns false for missing config file', async () => {
const modified = await sanitizeConfig(join(tempDir, 'nonexistent.json'));
expect(modified).toBe(false);
});
it('handles skills being an array (no-op, no crash)', async () => {
// Edge case: skills is not an object
await writeConfig({ skills: ['something'] });
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(false);
});
it('preserves all other top-level config sections', async () => {
await writeConfig({
skills: { enabled: true, entries: {} },
channels: { discord: { token: 'abc', enabled: true } },
plugins: { entries: { whatsapp: { enabled: true } } },
gateway: { mode: 'local', auth: { token: 'xyz' } },
agents: { defaults: { model: { primary: 'gpt-4' } } },
models: { providers: { openai: { baseUrl: 'https://api.openai.com' } } },
});
const modified = await sanitizeConfig(configPath);
expect(modified).toBe(true);
const result = await readConfig();
// skills.enabled removed
expect(result.skills).not.toHaveProperty('enabled');
// All other sections unchanged
expect(result.channels).toEqual({ discord: { token: 'abc', enabled: true } });
expect(result.plugins).toEqual({ entries: { whatsapp: { enabled: true } } });
expect(result.gateway).toEqual({ mode: 'local', auth: { token: 'xyz' } });
expect(result.agents).toEqual({ defaults: { model: { primary: 'gpt-4' } } });
});
});