fix(gateway): handle Windows OpenClaw process exit error during in-process restarts (#794)

Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Co-authored-by: Haze <hazeone@users.noreply.github.com>
This commit is contained in:
Haze
2026-04-08 12:06:12 +08:00
committed by GitHub
Unverified
parent 3021ad5089
commit 32d14b8cf9
8 changed files with 687 additions and 24 deletions

View File

@@ -0,0 +1,122 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
vi.mock('electron', () => ({
app: {
getPath: () => '/tmp',
isPackaged: false,
},
utilityProcess: {
fork: vi.fn(),
},
}));
vi.mock('@electron/utils/logger', () => ({
logger: {
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
debug: vi.fn(),
},
}));
describe('GatewayManager restart recovery', () => {
beforeEach(() => {
vi.resetModules();
vi.clearAllMocks();
vi.useFakeTimers();
vi.setSystemTime(new Date('2026-04-08T00:00:00.000Z'));
});
it('re-enables auto-reconnect when start() fails during restart', async () => {
const { GatewayManager } = await import('@electron/gateway/manager');
const manager = new GatewayManager();
// Expose private members for testing
const internals = manager as unknown as {
shouldReconnect: boolean;
status: { state: string; port: number };
startLock: boolean;
reconnectTimer: NodeJS.Timeout | null;
restartInFlight: Promise<void> | null;
scheduleReconnect: () => void;
stop: () => Promise<void>;
start: () => Promise<void>;
};
// Set the manager into a state where restart can proceed:
// - state must not be 'starting' or 'reconnecting' (would defer restart)
// - startLock must be false
internals.status = { state: 'running', port: 18789 };
internals.startLock = false;
internals.shouldReconnect = true;
// Mock stop to just reset flags (simulates normal stop)
vi.spyOn(manager, 'stop').mockImplementation(async () => {
internals.shouldReconnect = false;
internals.status = { state: 'stopped', port: 18789 };
});
// Mock start to fail (simulates the race condition where gateway
// is reachable but not attachable after in-process restart)
vi.spyOn(manager, 'start').mockRejectedValue(
new Error('WebSocket closed before handshake: unknown'),
);
// Spy on scheduleReconnect
const scheduleReconnectSpy = vi.spyOn(
internals as unknown as { scheduleReconnect: () => void },
'scheduleReconnect',
);
// Perform the restart - it should throw because start() fails
await expect(manager.restart()).rejects.toThrow(
'WebSocket closed before handshake: unknown',
);
// KEY ASSERTION: After start() fails in restart(), shouldReconnect
// must be re-enabled so the gateway can self-heal
expect(internals.shouldReconnect).toBe(true);
expect(scheduleReconnectSpy).toHaveBeenCalled();
});
it('does not schedule extra reconnect when restart succeeds', async () => {
const { GatewayManager } = await import('@electron/gateway/manager');
const manager = new GatewayManager();
const internals = manager as unknown as {
shouldReconnect: boolean;
status: { state: string; port: number };
startLock: boolean;
reconnectTimer: NodeJS.Timeout | null;
restartInFlight: Promise<void> | null;
scheduleReconnect: () => void;
};
internals.status = { state: 'running', port: 18789 };
internals.startLock = false;
internals.shouldReconnect = true;
// Mock stop to reset flags
vi.spyOn(manager, 'stop').mockImplementation(async () => {
internals.shouldReconnect = false;
internals.status = { state: 'stopped', port: 18789 };
});
// Mock start to succeed
vi.spyOn(manager, 'start').mockImplementation(async () => {
internals.shouldReconnect = true;
internals.status = { state: 'running', port: 18789 };
});
const scheduleReconnectSpy = vi.spyOn(
internals as unknown as { scheduleReconnect: () => void },
'scheduleReconnect',
);
await manager.restart();
// scheduleReconnect should NOT have been called by the catch block
// (it may be called from other paths, but not the restart-recovery catch)
expect(scheduleReconnectSpy).not.toHaveBeenCalled();
});
});

View File

@@ -0,0 +1,259 @@
import { beforeEach, describe, expect, it, vi } from 'vitest';
vi.mock('electron', () => ({
app: {
getPath: () => '/tmp',
isPackaged: false,
},
utilityProcess: {},
}));
vi.mock('@electron/utils/logger', () => ({
logger: {
info: vi.fn(),
warn: vi.fn(),
error: vi.fn(),
debug: vi.fn(),
},
}));
import { runGatewayStartupSequence } from '@electron/gateway/startup-orchestrator';
import { LifecycleSupersededError } from '@electron/gateway/lifecycle-controller';
function createMockHooks(overrides: Partial<Parameters<typeof runGatewayStartupSequence>[0]> = {}) {
return {
port: 18789,
shouldWaitForPortFree: true,
hasOwnedProcess: vi.fn().mockReturnValue(false),
resetStartupStderrLines: vi.fn(),
getStartupStderrLines: vi.fn().mockReturnValue([]),
assertLifecycle: vi.fn(),
findExistingGateway: vi.fn().mockResolvedValue(null),
connect: vi.fn().mockResolvedValue(undefined),
onConnectedToExistingGateway: vi.fn(),
waitForPortFree: vi.fn().mockResolvedValue(undefined),
startProcess: vi.fn().mockResolvedValue(undefined),
waitForReady: vi.fn().mockResolvedValue(undefined),
onConnectedToManagedGateway: vi.fn(),
runDoctorRepair: vi.fn().mockResolvedValue(false),
onDoctorRepairSuccess: vi.fn(),
delay: vi.fn().mockResolvedValue(undefined),
...overrides,
};
}
describe('runGatewayStartupSequence', () => {
beforeEach(() => {
vi.clearAllMocks();
});
it('connects to existing gateway when findExistingGateway returns a result', async () => {
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue({ port: 18789, externalToken: 'tok-ext' }),
});
await runGatewayStartupSequence(hooks);
expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
expect(hooks.connect).toHaveBeenCalledWith(18789, 'tok-ext');
expect(hooks.onConnectedToExistingGateway).toHaveBeenCalledTimes(1);
expect(hooks.startProcess).not.toHaveBeenCalled();
expect(hooks.onConnectedToManagedGateway).not.toHaveBeenCalled();
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start');
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/find-existing');
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect-existing');
});
it('waits for owned process when hasOwnedProcess returns true (in-process restart path)', async () => {
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockReturnValue(true),
});
await runGatewayStartupSequence(hooks);
expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
expect(hooks.hasOwnedProcess).toHaveBeenCalled();
expect(hooks.waitForReady).toHaveBeenCalledWith(18789);
expect(hooks.connect).toHaveBeenCalledWith(18789);
expect(hooks.onConnectedToExistingGateway).toHaveBeenCalledTimes(1);
// Must NOT start a new process or wait for port free
expect(hooks.startProcess).not.toHaveBeenCalled();
expect(hooks.waitForPortFree).not.toHaveBeenCalled();
expect(hooks.onConnectedToManagedGateway).not.toHaveBeenCalled();
// Verify lifecycle assertions for the owned-process path
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-ready-owned');
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect-owned');
});
it('starts new process when no existing gateway and no owned process', async () => {
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockReturnValue(false),
shouldWaitForPortFree: true,
});
await runGatewayStartupSequence(hooks);
expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
expect(hooks.hasOwnedProcess).toHaveBeenCalled();
expect(hooks.waitForPortFree).toHaveBeenCalledWith(18789);
expect(hooks.startProcess).toHaveBeenCalledTimes(1);
expect(hooks.waitForReady).toHaveBeenCalledWith(18789);
expect(hooks.connect).toHaveBeenCalledWith(18789);
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
expect(hooks.onConnectedToExistingGateway).not.toHaveBeenCalled();
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-port');
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/start-process');
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-ready');
expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect');
});
it('skips waitForPortFree when shouldWaitForPortFree is false', async () => {
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockReturnValue(false),
shouldWaitForPortFree: false,
});
await runGatewayStartupSequence(hooks);
expect(hooks.waitForPortFree).not.toHaveBeenCalled();
expect(hooks.startProcess).toHaveBeenCalledTimes(1);
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
});
it('retries on transient WebSocket errors', async () => {
let callCount = 0;
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockReturnValue(false),
connect: vi.fn().mockImplementation(async () => {
callCount++;
if (callCount === 1) {
throw new Error('WebSocket closed before handshake: unknown');
}
}),
maxStartAttempts: 3,
});
await runGatewayStartupSequence(hooks);
// First attempt fails with transient error, second succeeds
expect(hooks.connect).toHaveBeenCalledTimes(2);
expect(hooks.delay).toHaveBeenCalledWith(1000);
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
});
it('runs doctor repair on config-invalid stderr signal', async () => {
let attemptNumber = 0;
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockReturnValue(false),
startProcess: vi.fn().mockImplementation(async () => {
attemptNumber++;
if (attemptNumber === 1) {
throw new Error('Gateway process exited before becoming ready (code=1)');
}
}),
getStartupStderrLines: vi.fn().mockReturnValue([
'Config invalid',
'Run: openclaw doctor --fix',
]),
runDoctorRepair: vi.fn().mockResolvedValue(true),
maxStartAttempts: 3,
});
await runGatewayStartupSequence(hooks);
expect(hooks.runDoctorRepair).toHaveBeenCalledTimes(1);
expect(hooks.onDoctorRepairSuccess).toHaveBeenCalledTimes(1);
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
});
it('fails after max attempts with transient errors', async () => {
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockReturnValue(false),
connect: vi.fn().mockRejectedValue(new Error('WebSocket closed before handshake: unknown')),
maxStartAttempts: 3,
});
await expect(runGatewayStartupSequence(hooks)).rejects.toThrow(
'WebSocket closed before handshake: unknown',
);
// Should have attempted 3 times total
expect(hooks.connect).toHaveBeenCalledTimes(3);
expect(hooks.delay).toHaveBeenCalledTimes(2); // delays between retries 1→2, 2→3
});
it('owned process path falls back to retry when waitForReady throws a transient error', async () => {
let waitForReadyCalls = 0;
let hasOwnedProcessCalls = 0;
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockImplementation(() => {
hasOwnedProcessCalls++;
// First attempt: owned process is still alive (in-process restart scenario)
// Second attempt: process exited, no longer owned
return hasOwnedProcessCalls === 1;
}),
waitForReady: vi.fn().mockImplementation(async () => {
waitForReadyCalls++;
if (waitForReadyCalls === 1) {
throw new Error('WebSocket closed before handshake: unknown');
}
}),
maxStartAttempts: 3,
});
await runGatewayStartupSequence(hooks);
// First attempt: owned-process path → waitForReady throws → retry
// Second attempt: not owned → normal start path → succeeds
expect(hasOwnedProcessCalls).toBe(2);
expect(hooks.startProcess).toHaveBeenCalledTimes(1);
expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
expect(hooks.delay).toHaveBeenCalledWith(1000);
});
it('re-throws LifecycleSupersededError without retry', async () => {
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockReturnValue(false),
startProcess: vi.fn().mockRejectedValue(
new LifecycleSupersededError('Lifecycle superseded during start'),
),
});
await expect(runGatewayStartupSequence(hooks)).rejects.toThrow(LifecycleSupersededError);
// Should NOT retry — only one attempt
expect(hooks.startProcess).toHaveBeenCalledTimes(1);
expect(hooks.delay).not.toHaveBeenCalled();
});
it('resets startup stderr lines on each attempt', async () => {
let callCount = 0;
const hooks = createMockHooks({
findExistingGateway: vi.fn().mockResolvedValue(null),
hasOwnedProcess: vi.fn().mockReturnValue(false),
connect: vi.fn().mockImplementation(async () => {
callCount++;
if (callCount === 1) {
throw new Error('WebSocket closed before handshake: unknown');
}
}),
maxStartAttempts: 3,
});
await runGatewayStartupSequence(hooks);
// resetStartupStderrLines should be called once per attempt
expect(hooks.resetStartupStderrLines).toHaveBeenCalledTimes(2);
});
});

View File

@@ -12,6 +12,9 @@ const wsState = vi.hoisted(() => ({
this.emit('close', code, Buffer.from(String(reason)));
});
});
readonly terminate = vi.fn(() => {
this.readyState = 3;
});
readonly send = vi.fn((payload: string) => {
this.sentFrames.push(payload);
});
@@ -61,6 +64,7 @@ vi.mock('@electron/utils/logger', () => ({
import {
GATEWAY_CONNECT_HANDSHAKE_TIMEOUT_MS,
connectGatewaySocket,
probeGatewayReady,
} from '@electron/gateway/ws-client';
async function flushMicrotasks(): Promise<void> {
@@ -182,3 +186,126 @@ describe('connectGatewaySocket', () => {
expect(pendingRequests.size).toBe(0);
});
});
describe('probeGatewayReady', () => {
beforeEach(() => {
vi.useFakeTimers();
wsState.sockets.length = 0;
});
afterEach(() => {
vi.useRealTimers();
vi.clearAllMocks();
wsState.sockets.length = 0;
});
it('resolves true when connect.challenge message is received', async () => {
const probePromise = probeGatewayReady(18789, 5000);
const socket = getLatestSocket();
socket.emitOpen();
socket.emitJsonMessage({
type: 'event',
event: 'connect.challenge',
payload: { nonce: 'probe-nonce' },
});
await expect(probePromise).resolves.toBe(true);
expect(socket.terminate).toHaveBeenCalled();
});
it('resolves false on WebSocket error', async () => {
const probePromise = probeGatewayReady(18789, 5000);
const socket = getLatestSocket();
socket.emit('error', new Error('ECONNREFUSED'));
await expect(probePromise).resolves.toBe(false);
expect(socket.terminate).toHaveBeenCalled();
});
it('resolves false on timeout when no message is received', async () => {
const probePromise = probeGatewayReady(18789, 2000);
const socket = getLatestSocket();
socket.emitOpen();
// No message sent — just advance time past timeout
await vi.advanceTimersByTimeAsync(2001);
await expect(probePromise).resolves.toBe(false);
expect(socket.terminate).toHaveBeenCalled();
});
it('resolves false when socket closes before challenge', async () => {
const probePromise = probeGatewayReady(18789, 5000);
const socket = getLatestSocket();
socket.emitOpen();
// Emit close directly (not through the mock's close() method)
socket.emit('close', 1006, Buffer.from(''));
await expect(probePromise).resolves.toBe(false);
expect(socket.terminate).toHaveBeenCalled();
});
it('does NOT resolve true on plain open event (key behavioral change)', async () => {
const probePromise = probeGatewayReady(18789, 500);
const socket = getLatestSocket();
// Only emit open — no connect.challenge message
socket.emitOpen();
// The old implementation would have resolved true here.
// The new implementation waits for connect.challenge.
await vi.advanceTimersByTimeAsync(501);
await expect(probePromise).resolves.toBe(false);
});
it('uses terminate() instead of close() for cleanup to avoid Windows TIME_WAIT', async () => {
const probePromise = probeGatewayReady(18789, 5000);
const socket = getLatestSocket();
socket.emitOpen();
socket.emitJsonMessage({
type: 'event',
event: 'connect.challenge',
payload: { nonce: 'nonce-1' },
});
await expect(probePromise).resolves.toBe(true);
// Must use terminate(), not close()
expect(socket.terminate).toHaveBeenCalledTimes(1);
expect(socket.close).not.toHaveBeenCalled();
});
it('ignores non-challenge messages', async () => {
const probePromise = probeGatewayReady(18789, 1000);
const socket = getLatestSocket();
socket.emitOpen();
// Send a message that is NOT connect.challenge
socket.emitJsonMessage({
type: 'event',
event: 'some.other.event',
payload: {},
});
// Should still be waiting — not resolved yet
await vi.advanceTimersByTimeAsync(1001);
await expect(probePromise).resolves.toBe(false);
});
it('ignores malformed JSON messages', async () => {
const probePromise = probeGatewayReady(18789, 1000);
const socket = getLatestSocket();
socket.emitOpen();
// Send raw invalid JSON
socket.emit('message', Buffer.from('not-json'));
await vi.advanceTimersByTimeAsync(1001);
await expect(probePromise).resolves.toBe(false);
});
});