fix(gateway): handle Windows OpenClaw process exit error during in-process restarts (#794)
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Co-authored-by: Haze <hazeone@users.noreply.github.com>
This commit is contained in:
@@ -242,6 +242,7 @@ export class GatewayManager extends EventEmitter {
|
||||
port: this.status.port,
|
||||
ownedPid: this.process?.pid,
|
||||
shouldWaitForPortFree: process.platform === 'win32',
|
||||
hasOwnedProcess: () => this.process?.pid != null && this.ownsProcess,
|
||||
resetStartupStderrLines: () => {
|
||||
this.recentStartupStderrLines = [];
|
||||
},
|
||||
@@ -446,7 +447,16 @@ export class GatewayManager extends EventEmitter {
|
||||
logger.info(`[gateway-refresh] mode=restart requested pidBefore=${pidBefore ?? 'n/a'}`);
|
||||
this.restartInFlight = (async () => {
|
||||
await this.stop();
|
||||
await this.start();
|
||||
try {
|
||||
await this.start();
|
||||
} catch (err) {
|
||||
// stop() set shouldReconnect=false. Restore it so the gateway
|
||||
// can self-heal via scheduleReconnect() instead of dying permanently.
|
||||
logger.warn('Gateway restart: start() failed after stop(), enabling auto-reconnect recovery', err);
|
||||
this.shouldReconnect = true;
|
||||
this.scheduleReconnect();
|
||||
throw err;
|
||||
}
|
||||
})();
|
||||
|
||||
try {
|
||||
|
||||
@@ -12,6 +12,8 @@ type StartupHooks = {
|
||||
ownedPid?: never; // Removed: pid is now read dynamically in findExistingGateway to avoid stale-snapshot bug
|
||||
shouldWaitForPortFree: boolean;
|
||||
maxStartAttempts?: number;
|
||||
/** Returns true when the manager still owns a living Gateway process (e.g. after a code-1012 in-process restart). */
|
||||
hasOwnedProcess: () => boolean;
|
||||
resetStartupStderrLines: () => void;
|
||||
getStartupStderrLines: () => string[];
|
||||
assertLifecycle: (phase: string) => void;
|
||||
@@ -49,6 +51,22 @@ export async function runGatewayStartupSequence(hooks: StartupHooks): Promise<vo
|
||||
return;
|
||||
}
|
||||
|
||||
// When the Gateway did an in-process restart (WS close 1012), the
|
||||
// UtilityProcess is still alive but its WS server may be mid-rebuild,
|
||||
// so findExistingGateway's quick probe returns null. Rather than
|
||||
// waiting for the port to free (it never will — the process holds it)
|
||||
// and then spawning a duplicate, wait for the existing process to
|
||||
// become ready and reconnect to it.
|
||||
if (hooks.hasOwnedProcess()) {
|
||||
logger.info('Owned Gateway process still alive (likely in-process restart); waiting for it to become ready');
|
||||
await hooks.waitForReady(hooks.port);
|
||||
hooks.assertLifecycle('start/wait-ready-owned');
|
||||
await hooks.connect(hooks.port);
|
||||
hooks.assertLifecycle('start/connect-owned');
|
||||
hooks.onConnectedToExistingGateway();
|
||||
return;
|
||||
}
|
||||
|
||||
logger.debug('No existing Gateway found, starting new process...');
|
||||
|
||||
if (hooks.shouldWaitForPortFree) {
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
import { app, utilityProcess } from 'electron';
|
||||
import path from 'path';
|
||||
import { existsSync } from 'fs';
|
||||
import WebSocket from 'ws';
|
||||
import { getOpenClawDir, getOpenClawEntryPath } from '../utils/paths';
|
||||
import { getUvMirrorEnv } from '../utils/uv-env';
|
||||
import { isPythonReady, setupManagedPython } from '../utils/uv-setup';
|
||||
import { logger } from '../utils/logger';
|
||||
import { prependPathEntry } from '../utils/env-path';
|
||||
import { probeGatewayReady } from './ws-client';
|
||||
|
||||
export function warmupManagedPythonReadiness(): void {
|
||||
void isPythonReady().then((pythonReady) => {
|
||||
@@ -255,27 +255,8 @@ export async function findExistingGatewayProcess(options: {
|
||||
logger.warn('Error checking for existing process on port:', err);
|
||||
}
|
||||
|
||||
return await new Promise<{ port: number; externalToken?: string } | null>((resolve) => {
|
||||
const testWs = new WebSocket(`ws://localhost:${port}/ws`);
|
||||
const terminateAndResolve = (result: { port: number; externalToken?: string } | null) => {
|
||||
// terminate() avoids TIME_WAIT on Windows (vs close() which does WS handshake)
|
||||
try { testWs.terminate(); } catch { /* ignore */ }
|
||||
resolve(result);
|
||||
};
|
||||
const timeout = setTimeout(() => {
|
||||
terminateAndResolve(null);
|
||||
}, 2000);
|
||||
|
||||
testWs.on('open', () => {
|
||||
clearTimeout(timeout);
|
||||
terminateAndResolve({ port });
|
||||
});
|
||||
|
||||
testWs.on('error', () => {
|
||||
clearTimeout(timeout);
|
||||
resolve(null);
|
||||
});
|
||||
});
|
||||
const ready = await probeGatewayReady(port, 5000);
|
||||
return ready ? { port } : null;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user