fix(gateway): handle Windows OpenClaw process exit error during in-process restarts (#794)

Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Co-authored-by: Haze <hazeone@users.noreply.github.com>
This commit is contained in:
Haze
2026-04-08 12:06:12 +08:00
committed by GitHub
Unverified
parent 3021ad5089
commit 32d14b8cf9
8 changed files with 687 additions and 24 deletions

View File

@@ -242,6 +242,7 @@ export class GatewayManager extends EventEmitter {
port: this.status.port,
ownedPid: this.process?.pid,
shouldWaitForPortFree: process.platform === 'win32',
hasOwnedProcess: () => this.process?.pid != null && this.ownsProcess,
resetStartupStderrLines: () => {
this.recentStartupStderrLines = [];
},
@@ -446,7 +447,16 @@ export class GatewayManager extends EventEmitter {
logger.info(`[gateway-refresh] mode=restart requested pidBefore=${pidBefore ?? 'n/a'}`);
this.restartInFlight = (async () => {
await this.stop();
await this.start();
try {
await this.start();
} catch (err) {
// stop() set shouldReconnect=false. Restore it so the gateway
// can self-heal via scheduleReconnect() instead of dying permanently.
logger.warn('Gateway restart: start() failed after stop(), enabling auto-reconnect recovery', err);
this.shouldReconnect = true;
this.scheduleReconnect();
throw err;
}
})();
try {

View File

@@ -12,6 +12,8 @@ type StartupHooks = {
ownedPid?: never; // Removed: pid is now read dynamically in findExistingGateway to avoid stale-snapshot bug
shouldWaitForPortFree: boolean;
maxStartAttempts?: number;
/** Returns true when the manager still owns a living Gateway process (e.g. after a code-1012 in-process restart). */
hasOwnedProcess: () => boolean;
resetStartupStderrLines: () => void;
getStartupStderrLines: () => string[];
assertLifecycle: (phase: string) => void;
@@ -49,6 +51,22 @@ export async function runGatewayStartupSequence(hooks: StartupHooks): Promise<vo
return;
}
// When the Gateway did an in-process restart (WS close 1012), the
// UtilityProcess is still alive but its WS server may be mid-rebuild,
// so findExistingGateway's quick probe returns null. Rather than
// waiting for the port to free (it never will — the process holds it)
// and then spawning a duplicate, wait for the existing process to
// become ready and reconnect to it.
if (hooks.hasOwnedProcess()) {
logger.info('Owned Gateway process still alive (likely in-process restart); waiting for it to become ready');
await hooks.waitForReady(hooks.port);
hooks.assertLifecycle('start/wait-ready-owned');
await hooks.connect(hooks.port);
hooks.assertLifecycle('start/connect-owned');
hooks.onConnectedToExistingGateway();
return;
}
logger.debug('No existing Gateway found, starting new process...');
if (hooks.shouldWaitForPortFree) {

View File

@@ -1,12 +1,12 @@
import { app, utilityProcess } from 'electron';
import path from 'path';
import { existsSync } from 'fs';
import WebSocket from 'ws';
import { getOpenClawDir, getOpenClawEntryPath } from '../utils/paths';
import { getUvMirrorEnv } from '../utils/uv-env';
import { isPythonReady, setupManagedPython } from '../utils/uv-setup';
import { logger } from '../utils/logger';
import { prependPathEntry } from '../utils/env-path';
import { probeGatewayReady } from './ws-client';
export function warmupManagedPythonReadiness(): void {
void isPythonReady().then((pythonReady) => {
@@ -255,27 +255,8 @@ export async function findExistingGatewayProcess(options: {
logger.warn('Error checking for existing process on port:', err);
}
return await new Promise<{ port: number; externalToken?: string } | null>((resolve) => {
const testWs = new WebSocket(`ws://localhost:${port}/ws`);
const terminateAndResolve = (result: { port: number; externalToken?: string } | null) => {
// terminate() avoids TIME_WAIT on Windows (unlike close(), which performs the WebSocket closing handshake)
try { testWs.terminate(); } catch { /* ignore */ }
resolve(result);
};
const timeout = setTimeout(() => {
terminateAndResolve(null);
}, 2000);
testWs.on('open', () => {
clearTimeout(timeout);
terminateAndResolve({ port });
});
testWs.on('error', () => {
clearTimeout(timeout);
resolve(null);
});
});
const ready = await probeGatewayReady(port, 5000);
return ready ? { port } : null;
} catch {
return null;
}