fix(gateway): handle Windows OpenClaw process exit error during in-process restarts (#794)
Co-authored-by: Cursor Agent <cursoragent@cursor.com> Co-authored-by: Haze <hazeone@users.noreply.github.com>
This commit is contained in:
@@ -242,6 +242,7 @@ export class GatewayManager extends EventEmitter {
|
||||
port: this.status.port,
|
||||
ownedPid: this.process?.pid,
|
||||
shouldWaitForPortFree: process.platform === 'win32',
|
||||
hasOwnedProcess: () => this.process?.pid != null && this.ownsProcess,
|
||||
resetStartupStderrLines: () => {
|
||||
this.recentStartupStderrLines = [];
|
||||
},
|
||||
@@ -446,7 +447,16 @@ export class GatewayManager extends EventEmitter {
|
||||
logger.info(`[gateway-refresh] mode=restart requested pidBefore=${pidBefore ?? 'n/a'}`);
|
||||
this.restartInFlight = (async () => {
|
||||
await this.stop();
|
||||
await this.start();
|
||||
try {
|
||||
await this.start();
|
||||
} catch (err) {
|
||||
// stop() set shouldReconnect=false. Restore it so the gateway
|
||||
// can self-heal via scheduleReconnect() instead of dying permanently.
|
||||
logger.warn('Gateway restart: start() failed after stop(), enabling auto-reconnect recovery', err);
|
||||
this.shouldReconnect = true;
|
||||
this.scheduleReconnect();
|
||||
throw err;
|
||||
}
|
||||
})();
|
||||
|
||||
try {
|
||||
|
||||
@@ -12,6 +12,8 @@ type StartupHooks = {
|
||||
ownedPid?: never; // Removed: pid is now read dynamically in findExistingGateway to avoid stale-snapshot bug
|
||||
shouldWaitForPortFree: boolean;
|
||||
maxStartAttempts?: number;
|
||||
/** Returns true when the manager still owns a living Gateway process (e.g. after a code-1012 in-process restart). */
|
||||
hasOwnedProcess: () => boolean;
|
||||
resetStartupStderrLines: () => void;
|
||||
getStartupStderrLines: () => string[];
|
||||
assertLifecycle: (phase: string) => void;
|
||||
@@ -49,6 +51,22 @@ export async function runGatewayStartupSequence(hooks: StartupHooks): Promise<vo
|
||||
return;
|
||||
}
|
||||
|
||||
// When the Gateway did an in-process restart (WS close 1012), the
|
||||
// UtilityProcess is still alive but its WS server may be mid-rebuild,
|
||||
// so findExistingGateway's quick probe returns null. Rather than
|
||||
// waiting for the port to free (it never will — the process holds it)
|
||||
// and then spawning a duplicate, wait for the existing process to
|
||||
// become ready and reconnect to it.
|
||||
if (hooks.hasOwnedProcess()) {
|
||||
logger.info('Owned Gateway process still alive (likely in-process restart); waiting for it to become ready');
|
||||
await hooks.waitForReady(hooks.port);
|
||||
hooks.assertLifecycle('start/wait-ready-owned');
|
||||
await hooks.connect(hooks.port);
|
||||
hooks.assertLifecycle('start/connect-owned');
|
||||
hooks.onConnectedToExistingGateway();
|
||||
return;
|
||||
}
|
||||
|
||||
logger.debug('No existing Gateway found, starting new process...');
|
||||
|
||||
if (hooks.shouldWaitForPortFree) {
|
||||
|
||||
@@ -1,12 +1,12 @@
|
||||
import { app, utilityProcess } from 'electron';
|
||||
import path from 'path';
|
||||
import { existsSync } from 'fs';
|
||||
import WebSocket from 'ws';
|
||||
import { getOpenClawDir, getOpenClawEntryPath } from '../utils/paths';
|
||||
import { getUvMirrorEnv } from '../utils/uv-env';
|
||||
import { isPythonReady, setupManagedPython } from '../utils/uv-setup';
|
||||
import { logger } from '../utils/logger';
|
||||
import { prependPathEntry } from '../utils/env-path';
|
||||
import { probeGatewayReady } from './ws-client';
|
||||
|
||||
export function warmupManagedPythonReadiness(): void {
|
||||
void isPythonReady().then((pythonReady) => {
|
||||
@@ -255,27 +255,8 @@ export async function findExistingGatewayProcess(options: {
|
||||
logger.warn('Error checking for existing process on port:', err);
|
||||
}
|
||||
|
||||
return await new Promise<{ port: number; externalToken?: string } | null>((resolve) => {
|
||||
const testWs = new WebSocket(`ws://localhost:${port}/ws`);
|
||||
const terminateAndResolve = (result: { port: number; externalToken?: string } | null) => {
|
||||
// terminate() avoids TIME_WAIT on Windows (vs close() which does WS handshake)
|
||||
try { testWs.terminate(); } catch { /* ignore */ }
|
||||
resolve(result);
|
||||
};
|
||||
const timeout = setTimeout(() => {
|
||||
terminateAndResolve(null);
|
||||
}, 2000);
|
||||
|
||||
testWs.on('open', () => {
|
||||
clearTimeout(timeout);
|
||||
terminateAndResolve({ port });
|
||||
});
|
||||
|
||||
testWs.on('error', () => {
|
||||
clearTimeout(timeout);
|
||||
resolve(null);
|
||||
});
|
||||
});
|
||||
const ready = await probeGatewayReady(port, 5000);
|
||||
return ready ? { port } : null;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
@@ -134,7 +134,7 @@ export function Channels() {
|
||||
setLoading(false);
|
||||
}
|
||||
// Stable reference — reads state via refs, no deps needed.
|
||||
// eslint-disable-next-line react-hooks/exhaustive-deps
|
||||
|
||||
}, []);
|
||||
|
||||
useEffect(() => {
|
||||
|
||||
146
tests/e2e/gateway-lifecycle.spec.ts
Normal file
146
tests/e2e/gateway-lifecycle.spec.ts
Normal file
@@ -0,0 +1,146 @@
|
||||
import { completeSetup, expect, installIpcMocks, test } from './fixtures/electron';
|
||||
|
||||
test.describe('ClawX gateway lifecycle resilience', () => {
|
||||
test('app remains fully navigable while gateway is disconnected', async ({ page }) => {
|
||||
// In E2E mode, gateway auto-start is skipped, so the app starts
|
||||
// with gateway in "stopped" state — simulating the disconnected scenario.
|
||||
await completeSetup(page);
|
||||
|
||||
// Navigate through all major pages to verify nothing crashes
|
||||
// when the gateway is not running.
|
||||
await page.getByTestId('sidebar-nav-models').click();
|
||||
await expect(page.getByTestId('models-page')).toBeVisible();
|
||||
|
||||
await page.getByTestId('sidebar-nav-agents').click();
|
||||
await expect(page.getByTestId('agents-page')).toBeVisible();
|
||||
|
||||
await page.getByTestId('sidebar-nav-channels').click();
|
||||
await expect(page.getByTestId('channels-page')).toBeVisible();
|
||||
|
||||
await page.getByTestId('sidebar-nav-settings').click();
|
||||
await expect(page.getByTestId('settings-page')).toBeVisible();
|
||||
|
||||
// Navigate back to chat — the gateway status indicator should be visible
|
||||
await page.getByTestId('sidebar-new-chat').click();
|
||||
// Verify the page didn't crash; main layout should still be stable
|
||||
await expect(page.getByTestId('main-layout')).toBeVisible();
|
||||
});
|
||||
|
||||
test('gateway status indicator updates when status transitions occur', async ({ electronApp, page }) => {
|
||||
await completeSetup(page);
|
||||
|
||||
// Mock the initial gateway status as "stopped"
|
||||
await installIpcMocks(electronApp, {
|
||||
gatewayStatus: { state: 'stopped', port: 18789 },
|
||||
});
|
||||
|
||||
// Simulate gateway status transitions by sending IPC events to the renderer.
|
||||
// This mimics the main process emitting gateway:status-changed events.
|
||||
|
||||
// Transition 1: stopped → starting
|
||||
await electronApp.evaluate(({ BrowserWindow }) => {
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
win?.webContents.send('gateway:status-changed', {
|
||||
state: 'starting',
|
||||
port: 18789,
|
||||
});
|
||||
});
|
||||
// Wait briefly for the renderer to process the IPC event
|
||||
await page.waitForTimeout(500);
|
||||
|
||||
// Transition 2: starting → running
|
||||
await electronApp.evaluate(({ BrowserWindow }) => {
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
win?.webContents.send('gateway:status-changed', {
|
||||
state: 'running',
|
||||
port: 18789,
|
||||
pid: 12345,
|
||||
connectedAt: Date.now(),
|
||||
});
|
||||
});
|
||||
await page.waitForTimeout(500);
|
||||
|
||||
// Verify navigation still works after status transitions
|
||||
await page.getByTestId('sidebar-nav-models').click();
|
||||
await expect(page.getByTestId('models-page')).toBeVisible();
|
||||
|
||||
// Transition 3: running → error (simulates the bug scenario where
|
||||
// gateway becomes unreachable after in-process restart)
|
||||
await electronApp.evaluate(({ BrowserWindow }) => {
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
win?.webContents.send('gateway:status-changed', {
|
||||
state: 'error',
|
||||
port: 18789,
|
||||
error: 'WebSocket closed before handshake',
|
||||
});
|
||||
});
|
||||
await page.waitForTimeout(500);
|
||||
|
||||
// App should still be functional in error state
|
||||
await page.getByTestId('sidebar-nav-agents').click();
|
||||
await expect(page.getByTestId('agents-page')).toBeVisible();
|
||||
|
||||
// Transition 4: error → reconnecting → running (the recovery path)
|
||||
await electronApp.evaluate(({ BrowserWindow }) => {
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
win?.webContents.send('gateway:status-changed', {
|
||||
state: 'reconnecting',
|
||||
port: 18789,
|
||||
reconnectAttempts: 1,
|
||||
});
|
||||
});
|
||||
await page.waitForTimeout(300);
|
||||
|
||||
await electronApp.evaluate(({ BrowserWindow }) => {
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
win?.webContents.send('gateway:status-changed', {
|
||||
state: 'running',
|
||||
port: 18789,
|
||||
pid: 23456,
|
||||
connectedAt: Date.now(),
|
||||
});
|
||||
});
|
||||
await page.waitForTimeout(500);
|
||||
|
||||
// Final navigation check to confirm app is still healthy after full lifecycle
|
||||
await page.getByTestId('sidebar-nav-settings').click();
|
||||
await expect(page.getByTestId('settings-page')).toBeVisible();
|
||||
await page.getByTestId('sidebar-new-chat').click();
|
||||
await expect(page.getByTestId('main-layout')).toBeVisible();
|
||||
});
|
||||
|
||||
test('app handles rapid gateway status transitions without crashing', async ({ electronApp, page }) => {
|
||||
await completeSetup(page);
|
||||
|
||||
// Simulate rapid status transitions like those seen in the bug log:
|
||||
// running → stopped → starting → error → reconnecting → running
|
||||
const states = [
|
||||
{ state: 'running', port: 18789, pid: 100 },
|
||||
{ state: 'stopped', port: 18789 },
|
||||
{ state: 'starting', port: 18789 },
|
||||
{ state: 'error', port: 18789, error: 'Port 18789 still occupied after 30000ms' },
|
||||
{ state: 'reconnecting', port: 18789, reconnectAttempts: 1 },
|
||||
{ state: 'starting', port: 18789 },
|
||||
{ state: 'running', port: 18789, pid: 200, connectedAt: Date.now() },
|
||||
];
|
||||
|
||||
for (const status of states) {
|
||||
await electronApp.evaluate(({ BrowserWindow }, s) => {
|
||||
const win = BrowserWindow.getAllWindows()[0];
|
||||
win?.webContents.send('gateway:status-changed', s);
|
||||
}, status);
|
||||
// Small delay between transitions to be more realistic
|
||||
await page.waitForTimeout(100);
|
||||
}
|
||||
|
||||
// Verify the app is still stable after rapid transitions
|
||||
await expect(page.getByTestId('main-layout')).toBeVisible();
|
||||
|
||||
// Navigate to verify no page is in a broken state
|
||||
await page.getByTestId('sidebar-nav-models').click();
|
||||
await expect(page.getByTestId('models-page')).toBeVisible();
|
||||
|
||||
await page.getByTestId('sidebar-nav-channels').click();
|
||||
await expect(page.getByTestId('channels-page')).toBeVisible();
|
||||
});
|
||||
});
|
||||
122
tests/unit/gateway-manager-restart-recovery.test.ts
Normal file
122
tests/unit/gateway-manager-restart-recovery.test.ts
Normal file
@@ -0,0 +1,122 @@
|
||||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
vi.mock('electron', () => ({
|
||||
app: {
|
||||
getPath: () => '/tmp',
|
||||
isPackaged: false,
|
||||
},
|
||||
utilityProcess: {
|
||||
fork: vi.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
vi.mock('@electron/utils/logger', () => ({
|
||||
logger: {
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
error: vi.fn(),
|
||||
debug: vi.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
describe('GatewayManager restart recovery', () => {
|
||||
beforeEach(() => {
|
||||
vi.resetModules();
|
||||
vi.clearAllMocks();
|
||||
vi.useFakeTimers();
|
||||
vi.setSystemTime(new Date('2026-04-08T00:00:00.000Z'));
|
||||
});
|
||||
|
||||
it('re-enables auto-reconnect when start() fails during restart', async () => {
|
||||
const { GatewayManager } = await import('@electron/gateway/manager');
|
||||
const manager = new GatewayManager();
|
||||
|
||||
// Expose private members for testing
|
||||
const internals = manager as unknown as {
|
||||
shouldReconnect: boolean;
|
||||
status: { state: string; port: number };
|
||||
startLock: boolean;
|
||||
reconnectTimer: NodeJS.Timeout | null;
|
||||
restartInFlight: Promise<void> | null;
|
||||
scheduleReconnect: () => void;
|
||||
stop: () => Promise<void>;
|
||||
start: () => Promise<void>;
|
||||
};
|
||||
|
||||
// Set the manager into a state where restart can proceed:
|
||||
// - state must not be 'starting' or 'reconnecting' (would defer restart)
|
||||
// - startLock must be false
|
||||
internals.status = { state: 'running', port: 18789 };
|
||||
internals.startLock = false;
|
||||
internals.shouldReconnect = true;
|
||||
|
||||
// Mock stop to just reset flags (simulates normal stop)
|
||||
vi.spyOn(manager, 'stop').mockImplementation(async () => {
|
||||
internals.shouldReconnect = false;
|
||||
internals.status = { state: 'stopped', port: 18789 };
|
||||
});
|
||||
|
||||
// Mock start to fail (simulates the race condition where gateway
|
||||
// is reachable but not attachable after in-process restart)
|
||||
vi.spyOn(manager, 'start').mockRejectedValue(
|
||||
new Error('WebSocket closed before handshake: unknown'),
|
||||
);
|
||||
|
||||
// Spy on scheduleReconnect
|
||||
const scheduleReconnectSpy = vi.spyOn(
|
||||
internals as unknown as { scheduleReconnect: () => void },
|
||||
'scheduleReconnect',
|
||||
);
|
||||
|
||||
// Perform the restart - it should throw because start() fails
|
||||
await expect(manager.restart()).rejects.toThrow(
|
||||
'WebSocket closed before handshake: unknown',
|
||||
);
|
||||
|
||||
// KEY ASSERTION: After start() fails in restart(), shouldReconnect
|
||||
// must be re-enabled so the gateway can self-heal
|
||||
expect(internals.shouldReconnect).toBe(true);
|
||||
expect(scheduleReconnectSpy).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('does not schedule extra reconnect when restart succeeds', async () => {
|
||||
const { GatewayManager } = await import('@electron/gateway/manager');
|
||||
const manager = new GatewayManager();
|
||||
|
||||
const internals = manager as unknown as {
|
||||
shouldReconnect: boolean;
|
||||
status: { state: string; port: number };
|
||||
startLock: boolean;
|
||||
reconnectTimer: NodeJS.Timeout | null;
|
||||
restartInFlight: Promise<void> | null;
|
||||
scheduleReconnect: () => void;
|
||||
};
|
||||
|
||||
internals.status = { state: 'running', port: 18789 };
|
||||
internals.startLock = false;
|
||||
internals.shouldReconnect = true;
|
||||
|
||||
// Mock stop to reset flags
|
||||
vi.spyOn(manager, 'stop').mockImplementation(async () => {
|
||||
internals.shouldReconnect = false;
|
||||
internals.status = { state: 'stopped', port: 18789 };
|
||||
});
|
||||
|
||||
// Mock start to succeed
|
||||
vi.spyOn(manager, 'start').mockImplementation(async () => {
|
||||
internals.shouldReconnect = true;
|
||||
internals.status = { state: 'running', port: 18789 };
|
||||
});
|
||||
|
||||
const scheduleReconnectSpy = vi.spyOn(
|
||||
internals as unknown as { scheduleReconnect: () => void },
|
||||
'scheduleReconnect',
|
||||
);
|
||||
|
||||
await manager.restart();
|
||||
|
||||
// scheduleReconnect should NOT have been called by the catch block
|
||||
// (it may be called from other paths, but not the restart-recovery catch)
|
||||
expect(scheduleReconnectSpy).not.toHaveBeenCalled();
|
||||
});
|
||||
});
|
||||
259
tests/unit/gateway-startup-orchestrator.test.ts
Normal file
259
tests/unit/gateway-startup-orchestrator.test.ts
Normal file
@@ -0,0 +1,259 @@
|
||||
import { beforeEach, describe, expect, it, vi } from 'vitest';
|
||||
|
||||
vi.mock('electron', () => ({
|
||||
app: {
|
||||
getPath: () => '/tmp',
|
||||
isPackaged: false,
|
||||
},
|
||||
utilityProcess: {},
|
||||
}));
|
||||
|
||||
vi.mock('@electron/utils/logger', () => ({
|
||||
logger: {
|
||||
info: vi.fn(),
|
||||
warn: vi.fn(),
|
||||
error: vi.fn(),
|
||||
debug: vi.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
import { runGatewayStartupSequence } from '@electron/gateway/startup-orchestrator';
|
||||
import { LifecycleSupersededError } from '@electron/gateway/lifecycle-controller';
|
||||
|
||||
function createMockHooks(overrides: Partial<Parameters<typeof runGatewayStartupSequence>[0]> = {}) {
|
||||
return {
|
||||
port: 18789,
|
||||
shouldWaitForPortFree: true,
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(false),
|
||||
resetStartupStderrLines: vi.fn(),
|
||||
getStartupStderrLines: vi.fn().mockReturnValue([]),
|
||||
assertLifecycle: vi.fn(),
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
connect: vi.fn().mockResolvedValue(undefined),
|
||||
onConnectedToExistingGateway: vi.fn(),
|
||||
waitForPortFree: vi.fn().mockResolvedValue(undefined),
|
||||
startProcess: vi.fn().mockResolvedValue(undefined),
|
||||
waitForReady: vi.fn().mockResolvedValue(undefined),
|
||||
onConnectedToManagedGateway: vi.fn(),
|
||||
runDoctorRepair: vi.fn().mockResolvedValue(false),
|
||||
onDoctorRepairSuccess: vi.fn(),
|
||||
delay: vi.fn().mockResolvedValue(undefined),
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe('runGatewayStartupSequence', () => {
|
||||
beforeEach(() => {
|
||||
vi.clearAllMocks();
|
||||
});
|
||||
|
||||
it('connects to existing gateway when findExistingGateway returns a result', async () => {
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue({ port: 18789, externalToken: 'tok-ext' }),
|
||||
});
|
||||
|
||||
await runGatewayStartupSequence(hooks);
|
||||
|
||||
expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
|
||||
expect(hooks.connect).toHaveBeenCalledWith(18789, 'tok-ext');
|
||||
expect(hooks.onConnectedToExistingGateway).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.startProcess).not.toHaveBeenCalled();
|
||||
expect(hooks.onConnectedToManagedGateway).not.toHaveBeenCalled();
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start');
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/find-existing');
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect-existing');
|
||||
});
|
||||
|
||||
it('waits for owned process when hasOwnedProcess returns true (in-process restart path)', async () => {
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(true),
|
||||
});
|
||||
|
||||
await runGatewayStartupSequence(hooks);
|
||||
|
||||
expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
|
||||
expect(hooks.hasOwnedProcess).toHaveBeenCalled();
|
||||
expect(hooks.waitForReady).toHaveBeenCalledWith(18789);
|
||||
expect(hooks.connect).toHaveBeenCalledWith(18789);
|
||||
expect(hooks.onConnectedToExistingGateway).toHaveBeenCalledTimes(1);
|
||||
|
||||
// Must NOT start a new process or wait for port free
|
||||
expect(hooks.startProcess).not.toHaveBeenCalled();
|
||||
expect(hooks.waitForPortFree).not.toHaveBeenCalled();
|
||||
expect(hooks.onConnectedToManagedGateway).not.toHaveBeenCalled();
|
||||
|
||||
// Verify lifecycle assertions for the owned-process path
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-ready-owned');
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect-owned');
|
||||
});
|
||||
|
||||
it('starts new process when no existing gateway and no owned process', async () => {
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(false),
|
||||
shouldWaitForPortFree: true,
|
||||
});
|
||||
|
||||
await runGatewayStartupSequence(hooks);
|
||||
|
||||
expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
|
||||
expect(hooks.hasOwnedProcess).toHaveBeenCalled();
|
||||
expect(hooks.waitForPortFree).toHaveBeenCalledWith(18789);
|
||||
expect(hooks.startProcess).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.waitForReady).toHaveBeenCalledWith(18789);
|
||||
expect(hooks.connect).toHaveBeenCalledWith(18789);
|
||||
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.onConnectedToExistingGateway).not.toHaveBeenCalled();
|
||||
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-port');
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/start-process');
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-ready');
|
||||
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect');
|
||||
});
|
||||
|
||||
it('skips waitForPortFree when shouldWaitForPortFree is false', async () => {
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(false),
|
||||
shouldWaitForPortFree: false,
|
||||
});
|
||||
|
||||
await runGatewayStartupSequence(hooks);
|
||||
|
||||
expect(hooks.waitForPortFree).not.toHaveBeenCalled();
|
||||
expect(hooks.startProcess).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('retries on transient WebSocket errors', async () => {
|
||||
let callCount = 0;
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(false),
|
||||
connect: vi.fn().mockImplementation(async () => {
|
||||
callCount++;
|
||||
if (callCount === 1) {
|
||||
throw new Error('WebSocket closed before handshake: unknown');
|
||||
}
|
||||
}),
|
||||
maxStartAttempts: 3,
|
||||
});
|
||||
|
||||
await runGatewayStartupSequence(hooks);
|
||||
|
||||
// First attempt fails with transient error, second succeeds
|
||||
expect(hooks.connect).toHaveBeenCalledTimes(2);
|
||||
expect(hooks.delay).toHaveBeenCalledWith(1000);
|
||||
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('runs doctor repair on config-invalid stderr signal', async () => {
|
||||
let attemptNumber = 0;
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(false),
|
||||
startProcess: vi.fn().mockImplementation(async () => {
|
||||
attemptNumber++;
|
||||
if (attemptNumber === 1) {
|
||||
throw new Error('Gateway process exited before becoming ready (code=1)');
|
||||
}
|
||||
}),
|
||||
getStartupStderrLines: vi.fn().mockReturnValue([
|
||||
'Config invalid',
|
||||
'Run: openclaw doctor --fix',
|
||||
]),
|
||||
runDoctorRepair: vi.fn().mockResolvedValue(true),
|
||||
maxStartAttempts: 3,
|
||||
});
|
||||
|
||||
await runGatewayStartupSequence(hooks);
|
||||
|
||||
expect(hooks.runDoctorRepair).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.onDoctorRepairSuccess).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('fails after max attempts with transient errors', async () => {
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(false),
|
||||
connect: vi.fn().mockRejectedValue(new Error('WebSocket closed before handshake: unknown')),
|
||||
maxStartAttempts: 3,
|
||||
});
|
||||
|
||||
await expect(runGatewayStartupSequence(hooks)).rejects.toThrow(
|
||||
'WebSocket closed before handshake: unknown',
|
||||
);
|
||||
|
||||
// Should have attempted 3 times total
|
||||
expect(hooks.connect).toHaveBeenCalledTimes(3);
|
||||
expect(hooks.delay).toHaveBeenCalledTimes(2); // delays between retries 1→2, 2→3
|
||||
});
|
||||
|
||||
it('owned process path falls back to retry when waitForReady throws a transient error', async () => {
|
||||
let waitForReadyCalls = 0;
|
||||
let hasOwnedProcessCalls = 0;
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockImplementation(() => {
|
||||
hasOwnedProcessCalls++;
|
||||
// First attempt: owned process is still alive (in-process restart scenario)
|
||||
// Second attempt: process exited, no longer owned
|
||||
return hasOwnedProcessCalls === 1;
|
||||
}),
|
||||
waitForReady: vi.fn().mockImplementation(async () => {
|
||||
waitForReadyCalls++;
|
||||
if (waitForReadyCalls === 1) {
|
||||
throw new Error('WebSocket closed before handshake: unknown');
|
||||
}
|
||||
}),
|
||||
maxStartAttempts: 3,
|
||||
});
|
||||
|
||||
await runGatewayStartupSequence(hooks);
|
||||
|
||||
// First attempt: owned-process path → waitForReady throws → retry
|
||||
// Second attempt: not owned → normal start path → succeeds
|
||||
expect(hasOwnedProcessCalls).toBe(2);
|
||||
expect(hooks.startProcess).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.delay).toHaveBeenCalledWith(1000);
|
||||
});
|
||||
|
||||
it('re-throws LifecycleSupersededError without retry', async () => {
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(false),
|
||||
startProcess: vi.fn().mockRejectedValue(
|
||||
new LifecycleSupersededError('Lifecycle superseded during start'),
|
||||
),
|
||||
});
|
||||
|
||||
await expect(runGatewayStartupSequence(hooks)).rejects.toThrow(LifecycleSupersededError);
|
||||
|
||||
// Should NOT retry — only one attempt
|
||||
expect(hooks.startProcess).toHaveBeenCalledTimes(1);
|
||||
expect(hooks.delay).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('resets startup stderr lines on each attempt', async () => {
|
||||
let callCount = 0;
|
||||
const hooks = createMockHooks({
|
||||
findExistingGateway: vi.fn().mockResolvedValue(null),
|
||||
hasOwnedProcess: vi.fn().mockReturnValue(false),
|
||||
connect: vi.fn().mockImplementation(async () => {
|
||||
callCount++;
|
||||
if (callCount === 1) {
|
||||
throw new Error('WebSocket closed before handshake: unknown');
|
||||
}
|
||||
}),
|
||||
maxStartAttempts: 3,
|
||||
});
|
||||
|
||||
await runGatewayStartupSequence(hooks);
|
||||
|
||||
// resetStartupStderrLines should be called once per attempt
|
||||
expect(hooks.resetStartupStderrLines).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
});
|
||||
@@ -12,6 +12,9 @@ const wsState = vi.hoisted(() => ({
|
||||
this.emit('close', code, Buffer.from(String(reason)));
|
||||
});
|
||||
});
|
||||
readonly terminate = vi.fn(() => {
|
||||
this.readyState = 3;
|
||||
});
|
||||
readonly send = vi.fn((payload: string) => {
|
||||
this.sentFrames.push(payload);
|
||||
});
|
||||
@@ -61,6 +64,7 @@ vi.mock('@electron/utils/logger', () => ({
|
||||
import {
|
||||
GATEWAY_CONNECT_HANDSHAKE_TIMEOUT_MS,
|
||||
connectGatewaySocket,
|
||||
probeGatewayReady,
|
||||
} from '@electron/gateway/ws-client';
|
||||
|
||||
async function flushMicrotasks(): Promise<void> {
|
||||
@@ -182,3 +186,126 @@ describe('connectGatewaySocket', () => {
|
||||
expect(pendingRequests.size).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('probeGatewayReady', () => {
|
||||
beforeEach(() => {
|
||||
vi.useFakeTimers();
|
||||
wsState.sockets.length = 0;
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
vi.useRealTimers();
|
||||
vi.clearAllMocks();
|
||||
wsState.sockets.length = 0;
|
||||
});
|
||||
|
||||
it('resolves true when connect.challenge message is received', async () => {
|
||||
const probePromise = probeGatewayReady(18789, 5000);
|
||||
const socket = getLatestSocket();
|
||||
|
||||
socket.emitOpen();
|
||||
socket.emitJsonMessage({
|
||||
type: 'event',
|
||||
event: 'connect.challenge',
|
||||
payload: { nonce: 'probe-nonce' },
|
||||
});
|
||||
|
||||
await expect(probePromise).resolves.toBe(true);
|
||||
expect(socket.terminate).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('resolves false on WebSocket error', async () => {
|
||||
const probePromise = probeGatewayReady(18789, 5000);
|
||||
const socket = getLatestSocket();
|
||||
|
||||
socket.emit('error', new Error('ECONNREFUSED'));
|
||||
|
||||
await expect(probePromise).resolves.toBe(false);
|
||||
expect(socket.terminate).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('resolves false on timeout when no message is received', async () => {
|
||||
const probePromise = probeGatewayReady(18789, 2000);
|
||||
const socket = getLatestSocket();
|
||||
|
||||
socket.emitOpen();
|
||||
// No message sent — just advance time past timeout
|
||||
await vi.advanceTimersByTimeAsync(2001);
|
||||
|
||||
await expect(probePromise).resolves.toBe(false);
|
||||
expect(socket.terminate).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('resolves false when socket closes before challenge', async () => {
|
||||
const probePromise = probeGatewayReady(18789, 5000);
|
||||
const socket = getLatestSocket();
|
||||
|
||||
socket.emitOpen();
|
||||
// Emit close directly (not through the mock's close() method)
|
||||
socket.emit('close', 1006, Buffer.from(''));
|
||||
|
||||
await expect(probePromise).resolves.toBe(false);
|
||||
expect(socket.terminate).toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('does NOT resolve true on plain open event (key behavioral change)', async () => {
|
||||
const probePromise = probeGatewayReady(18789, 500);
|
||||
const socket = getLatestSocket();
|
||||
|
||||
// Only emit open — no connect.challenge message
|
||||
socket.emitOpen();
|
||||
|
||||
// The old implementation would have resolved true here.
|
||||
// The new implementation waits for connect.challenge.
|
||||
await vi.advanceTimersByTimeAsync(501);
|
||||
|
||||
await expect(probePromise).resolves.toBe(false);
|
||||
});
|
||||
|
||||
it('uses terminate() instead of close() for cleanup to avoid Windows TIME_WAIT', async () => {
|
||||
const probePromise = probeGatewayReady(18789, 5000);
|
||||
const socket = getLatestSocket();
|
||||
|
||||
socket.emitOpen();
|
||||
socket.emitJsonMessage({
|
||||
type: 'event',
|
||||
event: 'connect.challenge',
|
||||
payload: { nonce: 'nonce-1' },
|
||||
});
|
||||
|
||||
await expect(probePromise).resolves.toBe(true);
|
||||
|
||||
// Must use terminate(), not close()
|
||||
expect(socket.terminate).toHaveBeenCalledTimes(1);
|
||||
expect(socket.close).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('ignores non-challenge messages', async () => {
|
||||
const probePromise = probeGatewayReady(18789, 1000);
|
||||
const socket = getLatestSocket();
|
||||
|
||||
socket.emitOpen();
|
||||
// Send a message that is NOT connect.challenge
|
||||
socket.emitJsonMessage({
|
||||
type: 'event',
|
||||
event: 'some.other.event',
|
||||
payload: {},
|
||||
});
|
||||
|
||||
// Should still be waiting — not resolved yet
|
||||
await vi.advanceTimersByTimeAsync(1001);
|
||||
await expect(probePromise).resolves.toBe(false);
|
||||
});
|
||||
|
||||
it('ignores malformed JSON messages', async () => {
|
||||
const probePromise = probeGatewayReady(18789, 1000);
|
||||
const socket = getLatestSocket();
|
||||
|
||||
socket.emitOpen();
|
||||
// Send raw invalid JSON
|
||||
socket.emit('message', Buffer.from('not-json'));
|
||||
|
||||
await vi.advanceTimersByTimeAsync(1001);
|
||||
await expect(probePromise).resolves.toBe(false);
|
||||
});
|
||||
});
|
||||
|
||||
Reference in New Issue
Block a user