fix(gateway): handle Windows OpenClaw process exit error during in-process restarts (#794)

Co-authored-by: Cursor Agent <cursoragent@cursor.com> Co-authored-by: Haze <hazeone@users.noreply.github.com>
2026-04-08 12:06:12 +08:00
parent 3021ad5089
commit 32d14b8cf9
8 changed files with 687 additions and 24 deletions
--- a/electron/gateway/manager.ts
+++ b/electron/gateway/manager.ts
@@ -242,6 +242,7 @@ export class GatewayManager extends EventEmitter {
        port: this.status.port,
        ownedPid: this.process?.pid,
        shouldWaitForPortFree: process.platform === 'win32',
+        hasOwnedProcess: () => this.process?.pid != null && this.ownsProcess,
        resetStartupStderrLines: () => {
          this.recentStartupStderrLines = [];
        },
@@ -446,7 +447,16 @@ export class GatewayManager extends EventEmitter {
    logger.info(`[gateway-refresh] mode=restart requested pidBefore=${pidBefore ?? 'n/a'}`);
    this.restartInFlight = (async () => {
      await this.stop();
-      await this.start();
+      try {
+        await this.start();
+      } catch (err) {
+        // stop() set shouldReconnect=false. Restore it so the gateway
+        // can self-heal via scheduleReconnect() instead of dying permanently.
+        logger.warn('Gateway restart: start() failed after stop(), enabling auto-reconnect recovery', err);
+        this.shouldReconnect = true;
+        this.scheduleReconnect();
+        throw err;
+      }
    })();

    try {
--- a/electron/gateway/startup-orchestrator.ts
+++ b/electron/gateway/startup-orchestrator.ts
@@ -12,6 +12,8 @@ type StartupHooks = {
  ownedPid?: never; // Removed: pid is now read dynamically in findExistingGateway to avoid stale-snapshot bug
  shouldWaitForPortFree: boolean;
  maxStartAttempts?: number;
+  /** Returns true when the manager still owns a living Gateway process (e.g. after a code-1012 in-process restart). */
+  hasOwnedProcess: () => boolean;
  resetStartupStderrLines: () => void;
  getStartupStderrLines: () => string[];
  assertLifecycle: (phase: string) => void;
@@ -49,6 +51,22 @@ export async function runGatewayStartupSequence(hooks: StartupHooks): Promise<vo
        return;
      }

+      // When the Gateway did an in-process restart (WS close 1012), the
+      // UtilityProcess is still alive but its WS server may be mid-rebuild,
+      // so findExistingGateway's quick probe returns null.  Rather than
+      // waiting for the port to free (it never will — the process holds it)
+      // and then spawning a duplicate, wait for the existing process to
+      // become ready and reconnect to it.
+      if (hooks.hasOwnedProcess()) {
+        logger.info('Owned Gateway process still alive (likely in-process restart); waiting for it to become ready');
+        await hooks.waitForReady(hooks.port);
+        hooks.assertLifecycle('start/wait-ready-owned');
+        await hooks.connect(hooks.port);
+        hooks.assertLifecycle('start/connect-owned');
+        hooks.onConnectedToExistingGateway();
+        return;
+      }
+
      logger.debug('No existing Gateway found, starting new process...');

      if (hooks.shouldWaitForPortFree) {
--- a/electron/gateway/supervisor.ts
+++ b/electron/gateway/supervisor.ts
@@ -1,12 +1,12 @@
 import { app, utilityProcess } from 'electron';
 import path from 'path';
 import { existsSync } from 'fs';
-import WebSocket from 'ws';
 import { getOpenClawDir, getOpenClawEntryPath } from '../utils/paths';
 import { getUvMirrorEnv } from '../utils/uv-env';
 import { isPythonReady, setupManagedPython } from '../utils/uv-setup';
 import { logger } from '../utils/logger';
 import { prependPathEntry } from '../utils/env-path';
+import { probeGatewayReady } from './ws-client';

 export function warmupManagedPythonReadiness(): void {
  void isPythonReady().then((pythonReady) => {
@@ -255,27 +255,8 @@ export async function findExistingGatewayProcess(options: {
      logger.warn('Error checking for existing process on port:', err);
    }

-    return await new Promise<{ port: number; externalToken?: string } | null>((resolve) => {
-      const testWs = new WebSocket(`ws://localhost:${port}/ws`);
-      const terminateAndResolve = (result: { port: number; externalToken?: string } | null) => {
-        // terminate() avoids TIME_WAIT on Windows (vs close() which does WS handshake)
-        try { testWs.terminate(); } catch { /* ignore */ }
-        resolve(result);
-      };
-      const timeout = setTimeout(() => {
-        terminateAndResolve(null);
-      }, 2000);
-
-      testWs.on('open', () => {
-        clearTimeout(timeout);
-        terminateAndResolve({ port });
-      });
-
-      testWs.on('error', () => {
-        clearTimeout(timeout);
-        resolve(null);
-      });
-    });
+    const ready = await probeGatewayReady(port, 5000);
+    return ready ? { port } : null;
  } catch {
    return null;
  }
--- a/src/pages/Channels/index.tsx
+++ b/src/pages/Channels/index.tsx
@@ -134,7 +134,7 @@ export function Channels() {
      setLoading(false);
    }
  // Stable reference — reads state via refs, no deps needed.
-  // eslint-disable-next-line react-hooks/exhaustive-deps
+   
  }, []);

  useEffect(() => {
--- a/tests/e2e/gateway-lifecycle.spec.ts
+++ b/tests/e2e/gateway-lifecycle.spec.ts
@@ -0,0 +1,146 @@
+import { completeSetup, expect, installIpcMocks, test } from './fixtures/electron';
+
+test.describe('ClawX gateway lifecycle resilience', () => {
+  test('app remains fully navigable while gateway is disconnected', async ({ page }) => {
+    // In E2E mode, gateway auto-start is skipped, so the app starts
+    // with gateway in "stopped" state — simulating the disconnected scenario.
+    await completeSetup(page);
+
+    // Navigate through all major pages to verify nothing crashes
+    // when the gateway is not running.
+    await page.getByTestId('sidebar-nav-models').click();
+    await expect(page.getByTestId('models-page')).toBeVisible();
+
+    await page.getByTestId('sidebar-nav-agents').click();
+    await expect(page.getByTestId('agents-page')).toBeVisible();
+
+    await page.getByTestId('sidebar-nav-channels').click();
+    await expect(page.getByTestId('channels-page')).toBeVisible();
+
+    await page.getByTestId('sidebar-nav-settings').click();
+    await expect(page.getByTestId('settings-page')).toBeVisible();
+
+    // Navigate back to chat — the gateway status indicator should be visible
+    await page.getByTestId('sidebar-new-chat').click();
+    // Verify the page didn't crash; main layout should still be stable
+    await expect(page.getByTestId('main-layout')).toBeVisible();
+  });
+
+  test('gateway status indicator updates when status transitions occur', async ({ electronApp, page }) => {
+    await completeSetup(page);
+
+    // Mock the initial gateway status as "stopped"
+    await installIpcMocks(electronApp, {
+      gatewayStatus: { state: 'stopped', port: 18789 },
+    });
+
+    // Simulate gateway status transitions by sending IPC events to the renderer.
+    // This mimics the main process emitting gateway:status-changed events.
+
+    // Transition 1: stopped → starting
+    await electronApp.evaluate(({ BrowserWindow }) => {
+      const win = BrowserWindow.getAllWindows()[0];
+      win?.webContents.send('gateway:status-changed', {
+        state: 'starting',
+        port: 18789,
+      });
+    });
+    // Wait briefly for the renderer to process the IPC event
+    await page.waitForTimeout(500);
+
+    // Transition 2: starting → running
+    await electronApp.evaluate(({ BrowserWindow }) => {
+      const win = BrowserWindow.getAllWindows()[0];
+      win?.webContents.send('gateway:status-changed', {
+        state: 'running',
+        port: 18789,
+        pid: 12345,
+        connectedAt: Date.now(),
+      });
+    });
+    await page.waitForTimeout(500);
+
+    // Verify navigation still works after status transitions
+    await page.getByTestId('sidebar-nav-models').click();
+    await expect(page.getByTestId('models-page')).toBeVisible();
+
+    // Transition 3: running → error (simulates the bug scenario where
+    // gateway becomes unreachable after in-process restart)
+    await electronApp.evaluate(({ BrowserWindow }) => {
+      const win = BrowserWindow.getAllWindows()[0];
+      win?.webContents.send('gateway:status-changed', {
+        state: 'error',
+        port: 18789,
+        error: 'WebSocket closed before handshake',
+      });
+    });
+    await page.waitForTimeout(500);
+
+    // App should still be functional in error state
+    await page.getByTestId('sidebar-nav-agents').click();
+    await expect(page.getByTestId('agents-page')).toBeVisible();
+
+    // Transition 4: error → reconnecting → running (the recovery path)
+    await electronApp.evaluate(({ BrowserWindow }) => {
+      const win = BrowserWindow.getAllWindows()[0];
+      win?.webContents.send('gateway:status-changed', {
+        state: 'reconnecting',
+        port: 18789,
+        reconnectAttempts: 1,
+      });
+    });
+    await page.waitForTimeout(300);
+
+    await electronApp.evaluate(({ BrowserWindow }) => {
+      const win = BrowserWindow.getAllWindows()[0];
+      win?.webContents.send('gateway:status-changed', {
+        state: 'running',
+        port: 18789,
+        pid: 23456,
+        connectedAt: Date.now(),
+      });
+    });
+    await page.waitForTimeout(500);
+
+    // Final navigation check to confirm app is still healthy after full lifecycle
+    await page.getByTestId('sidebar-nav-settings').click();
+    await expect(page.getByTestId('settings-page')).toBeVisible();
+    await page.getByTestId('sidebar-new-chat').click();
+    await expect(page.getByTestId('main-layout')).toBeVisible();
+  });
+
+  test('app handles rapid gateway status transitions without crashing', async ({ electronApp, page }) => {
+    await completeSetup(page);
+
+    // Simulate rapid status transitions like those seen in the bug log:
+    // running → stopped → starting → error → reconnecting → running
+    const states = [
+      { state: 'running', port: 18789, pid: 100 },
+      { state: 'stopped', port: 18789 },
+      { state: 'starting', port: 18789 },
+      { state: 'error', port: 18789, error: 'Port 18789 still occupied after 30000ms' },
+      { state: 'reconnecting', port: 18789, reconnectAttempts: 1 },
+      { state: 'starting', port: 18789 },
+      { state: 'running', port: 18789, pid: 200, connectedAt: Date.now() },
+    ];
+
+    for (const status of states) {
+      await electronApp.evaluate(({ BrowserWindow }, s) => {
+        const win = BrowserWindow.getAllWindows()[0];
+        win?.webContents.send('gateway:status-changed', s);
+      }, status);
+      // Small delay between transitions to be more realistic
+      await page.waitForTimeout(100);
+    }
+
+    // Verify the app is still stable after rapid transitions
+    await expect(page.getByTestId('main-layout')).toBeVisible();
+
+    // Navigate to verify no page is in a broken state
+    await page.getByTestId('sidebar-nav-models').click();
+    await expect(page.getByTestId('models-page')).toBeVisible();
+
+    await page.getByTestId('sidebar-nav-channels').click();
+    await expect(page.getByTestId('channels-page')).toBeVisible();
+  });
+});
--- a/tests/unit/gateway-manager-restart-recovery.test.ts
+++ b/tests/unit/gateway-manager-restart-recovery.test.ts
@@ -0,0 +1,122 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+vi.mock('electron', () => ({
+  app: {
+    getPath: () => '/tmp',
+    isPackaged: false,
+  },
+  utilityProcess: {
+    fork: vi.fn(),
+  },
+}));
+
+vi.mock('@electron/utils/logger', () => ({
+  logger: {
+    info: vi.fn(),
+    warn: vi.fn(),
+    error: vi.fn(),
+    debug: vi.fn(),
+  },
+}));
+
+describe('GatewayManager restart recovery', () => {
+  beforeEach(() => {
+    vi.resetModules();
+    vi.clearAllMocks();
+    vi.useFakeTimers();
+    vi.setSystemTime(new Date('2026-04-08T00:00:00.000Z'));
+  });
+
+  it('re-enables auto-reconnect when start() fails during restart', async () => {
+    const { GatewayManager } = await import('@electron/gateway/manager');
+    const manager = new GatewayManager();
+
+    // Expose private members for testing
+    const internals = manager as unknown as {
+      shouldReconnect: boolean;
+      status: { state: string; port: number };
+      startLock: boolean;
+      reconnectTimer: NodeJS.Timeout | null;
+      restartInFlight: Promise<void> | null;
+      scheduleReconnect: () => void;
+      stop: () => Promise<void>;
+      start: () => Promise<void>;
+    };
+
+    // Set the manager into a state where restart can proceed:
+    // - state must not be 'starting' or 'reconnecting' (would defer restart)
+    // - startLock must be false
+    internals.status = { state: 'running', port: 18789 };
+    internals.startLock = false;
+    internals.shouldReconnect = true;
+
+    // Mock stop to just reset flags (simulates normal stop)
+    vi.spyOn(manager, 'stop').mockImplementation(async () => {
+      internals.shouldReconnect = false;
+      internals.status = { state: 'stopped', port: 18789 };
+    });
+
+    // Mock start to fail (simulates the race condition where gateway
+    // is reachable but not attachable after in-process restart)
+    vi.spyOn(manager, 'start').mockRejectedValue(
+      new Error('WebSocket closed before handshake: unknown'),
+    );
+
+    // Spy on scheduleReconnect
+    const scheduleReconnectSpy = vi.spyOn(
+      internals as unknown as { scheduleReconnect: () => void },
+      'scheduleReconnect',
+    );
+
+    // Perform the restart - it should throw because start() fails
+    await expect(manager.restart()).rejects.toThrow(
+      'WebSocket closed before handshake: unknown',
+    );
+
+    // KEY ASSERTION: After start() fails in restart(), shouldReconnect
+    // must be re-enabled so the gateway can self-heal
+    expect(internals.shouldReconnect).toBe(true);
+    expect(scheduleReconnectSpy).toHaveBeenCalled();
+  });
+
+  it('does not schedule extra reconnect when restart succeeds', async () => {
+    const { GatewayManager } = await import('@electron/gateway/manager');
+    const manager = new GatewayManager();
+
+    const internals = manager as unknown as {
+      shouldReconnect: boolean;
+      status: { state: string; port: number };
+      startLock: boolean;
+      reconnectTimer: NodeJS.Timeout | null;
+      restartInFlight: Promise<void> | null;
+      scheduleReconnect: () => void;
+    };
+
+    internals.status = { state: 'running', port: 18789 };
+    internals.startLock = false;
+    internals.shouldReconnect = true;
+
+    // Mock stop to reset flags
+    vi.spyOn(manager, 'stop').mockImplementation(async () => {
+      internals.shouldReconnect = false;
+      internals.status = { state: 'stopped', port: 18789 };
+    });
+
+    // Mock start to succeed
+    vi.spyOn(manager, 'start').mockImplementation(async () => {
+      internals.shouldReconnect = true;
+      internals.status = { state: 'running', port: 18789 };
+    });
+
+    const scheduleReconnectSpy = vi.spyOn(
+      internals as unknown as { scheduleReconnect: () => void },
+      'scheduleReconnect',
+    );
+
+    await manager.restart();
+
+    // scheduleReconnect should NOT have been called by the catch block
+    // (it may be called from other paths, but not the restart-recovery catch)
+    expect(scheduleReconnectSpy).not.toHaveBeenCalled();
+  });
+});
--- a/tests/unit/gateway-startup-orchestrator.test.ts
+++ b/tests/unit/gateway-startup-orchestrator.test.ts
@@ -0,0 +1,259 @@
+import { beforeEach, describe, expect, it, vi } from 'vitest';
+
+vi.mock('electron', () => ({
+  app: {
+    getPath: () => '/tmp',
+    isPackaged: false,
+  },
+  utilityProcess: {},
+}));
+
+vi.mock('@electron/utils/logger', () => ({
+  logger: {
+    info: vi.fn(),
+    warn: vi.fn(),
+    error: vi.fn(),
+    debug: vi.fn(),
+  },
+}));
+
+import { runGatewayStartupSequence } from '@electron/gateway/startup-orchestrator';
+import { LifecycleSupersededError } from '@electron/gateway/lifecycle-controller';
+
+function createMockHooks(overrides: Partial<Parameters<typeof runGatewayStartupSequence>[0]> = {}) {
+  return {
+    port: 18789,
+    shouldWaitForPortFree: true,
+    hasOwnedProcess: vi.fn().mockReturnValue(false),
+    resetStartupStderrLines: vi.fn(),
+    getStartupStderrLines: vi.fn().mockReturnValue([]),
+    assertLifecycle: vi.fn(),
+    findExistingGateway: vi.fn().mockResolvedValue(null),
+    connect: vi.fn().mockResolvedValue(undefined),
+    onConnectedToExistingGateway: vi.fn(),
+    waitForPortFree: vi.fn().mockResolvedValue(undefined),
+    startProcess: vi.fn().mockResolvedValue(undefined),
+    waitForReady: vi.fn().mockResolvedValue(undefined),
+    onConnectedToManagedGateway: vi.fn(),
+    runDoctorRepair: vi.fn().mockResolvedValue(false),
+    onDoctorRepairSuccess: vi.fn(),
+    delay: vi.fn().mockResolvedValue(undefined),
+    ...overrides,
+  };
+}
+
+describe('runGatewayStartupSequence', () => {
+  beforeEach(() => {
+    vi.clearAllMocks();
+  });
+
+  it('connects to existing gateway when findExistingGateway returns a result', async () => {
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue({ port: 18789, externalToken: 'tok-ext' }),
+    });
+
+    await runGatewayStartupSequence(hooks);
+
+    expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
+    expect(hooks.connect).toHaveBeenCalledWith(18789, 'tok-ext');
+    expect(hooks.onConnectedToExistingGateway).toHaveBeenCalledTimes(1);
+    expect(hooks.startProcess).not.toHaveBeenCalled();
+    expect(hooks.onConnectedToManagedGateway).not.toHaveBeenCalled();
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start');
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/find-existing');
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect-existing');
+  });
+
+  it('waits for owned process when hasOwnedProcess returns true (in-process restart path)', async () => {
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockReturnValue(true),
+    });
+
+    await runGatewayStartupSequence(hooks);
+
+    expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
+    expect(hooks.hasOwnedProcess).toHaveBeenCalled();
+    expect(hooks.waitForReady).toHaveBeenCalledWith(18789);
+    expect(hooks.connect).toHaveBeenCalledWith(18789);
+    expect(hooks.onConnectedToExistingGateway).toHaveBeenCalledTimes(1);
+
+    // Must NOT start a new process or wait for port free
+    expect(hooks.startProcess).not.toHaveBeenCalled();
+    expect(hooks.waitForPortFree).not.toHaveBeenCalled();
+    expect(hooks.onConnectedToManagedGateway).not.toHaveBeenCalled();
+
+    // Verify lifecycle assertions for the owned-process path
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-ready-owned');
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect-owned');
+  });
+
+  it('starts new process when no existing gateway and no owned process', async () => {
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockReturnValue(false),
+      shouldWaitForPortFree: true,
+    });
+
+    await runGatewayStartupSequence(hooks);
+
+    expect(hooks.findExistingGateway).toHaveBeenCalledWith(18789);
+    expect(hooks.hasOwnedProcess).toHaveBeenCalled();
+    expect(hooks.waitForPortFree).toHaveBeenCalledWith(18789);
+    expect(hooks.startProcess).toHaveBeenCalledTimes(1);
+    expect(hooks.waitForReady).toHaveBeenCalledWith(18789);
+    expect(hooks.connect).toHaveBeenCalledWith(18789);
+    expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
+    expect(hooks.onConnectedToExistingGateway).not.toHaveBeenCalled();
+
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-port');
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/start-process');
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/wait-ready');
+    expect(hooks.assertLifecycle).toHaveBeenCalledWith('start/connect');
+  });
+
+  it('skips waitForPortFree when shouldWaitForPortFree is false', async () => {
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockReturnValue(false),
+      shouldWaitForPortFree: false,
+    });
+
+    await runGatewayStartupSequence(hooks);
+
+    expect(hooks.waitForPortFree).not.toHaveBeenCalled();
+    expect(hooks.startProcess).toHaveBeenCalledTimes(1);
+    expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
+  });
+
+  it('retries on transient WebSocket errors', async () => {
+    let callCount = 0;
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockReturnValue(false),
+      connect: vi.fn().mockImplementation(async () => {
+        callCount++;
+        if (callCount === 1) {
+          throw new Error('WebSocket closed before handshake: unknown');
+        }
+      }),
+      maxStartAttempts: 3,
+    });
+
+    await runGatewayStartupSequence(hooks);
+
+    // First attempt fails with transient error, second succeeds
+    expect(hooks.connect).toHaveBeenCalledTimes(2);
+    expect(hooks.delay).toHaveBeenCalledWith(1000);
+    expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
+  });
+
+  it('runs doctor repair on config-invalid stderr signal', async () => {
+    let attemptNumber = 0;
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockReturnValue(false),
+      startProcess: vi.fn().mockImplementation(async () => {
+        attemptNumber++;
+        if (attemptNumber === 1) {
+          throw new Error('Gateway process exited before becoming ready (code=1)');
+        }
+      }),
+      getStartupStderrLines: vi.fn().mockReturnValue([
+        'Config invalid',
+        'Run: openclaw doctor --fix',
+      ]),
+      runDoctorRepair: vi.fn().mockResolvedValue(true),
+      maxStartAttempts: 3,
+    });
+
+    await runGatewayStartupSequence(hooks);
+
+    expect(hooks.runDoctorRepair).toHaveBeenCalledTimes(1);
+    expect(hooks.onDoctorRepairSuccess).toHaveBeenCalledTimes(1);
+    expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
+  });
+
+  it('fails after max attempts with transient errors', async () => {
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockReturnValue(false),
+      connect: vi.fn().mockRejectedValue(new Error('WebSocket closed before handshake: unknown')),
+      maxStartAttempts: 3,
+    });
+
+    await expect(runGatewayStartupSequence(hooks)).rejects.toThrow(
+      'WebSocket closed before handshake: unknown',
+    );
+
+    // Should have attempted 3 times total
+    expect(hooks.connect).toHaveBeenCalledTimes(3);
+    expect(hooks.delay).toHaveBeenCalledTimes(2); // delays between retries 1→2, 2→3
+  });
+
+  it('owned process path falls back to retry when waitForReady throws a transient error', async () => {
+    let waitForReadyCalls = 0;
+    let hasOwnedProcessCalls = 0;
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockImplementation(() => {
+        hasOwnedProcessCalls++;
+        // First attempt: owned process is still alive (in-process restart scenario)
+        // Second attempt: process exited, no longer owned
+        return hasOwnedProcessCalls === 1;
+      }),
+      waitForReady: vi.fn().mockImplementation(async () => {
+        waitForReadyCalls++;
+        if (waitForReadyCalls === 1) {
+          throw new Error('WebSocket closed before handshake: unknown');
+        }
+      }),
+      maxStartAttempts: 3,
+    });
+
+    await runGatewayStartupSequence(hooks);
+
+    // First attempt: owned-process path → waitForReady throws → retry
+    // Second attempt: not owned → normal start path → succeeds
+    expect(hasOwnedProcessCalls).toBe(2);
+    expect(hooks.startProcess).toHaveBeenCalledTimes(1);
+    expect(hooks.onConnectedToManagedGateway).toHaveBeenCalledTimes(1);
+    expect(hooks.delay).toHaveBeenCalledWith(1000);
+  });
+
+  it('re-throws LifecycleSupersededError without retry', async () => {
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockReturnValue(false),
+      startProcess: vi.fn().mockRejectedValue(
+        new LifecycleSupersededError('Lifecycle superseded during start'),
+      ),
+    });
+
+    await expect(runGatewayStartupSequence(hooks)).rejects.toThrow(LifecycleSupersededError);
+
+    // Should NOT retry — only one attempt
+    expect(hooks.startProcess).toHaveBeenCalledTimes(1);
+    expect(hooks.delay).not.toHaveBeenCalled();
+  });
+
+  it('resets startup stderr lines on each attempt', async () => {
+    let callCount = 0;
+    const hooks = createMockHooks({
+      findExistingGateway: vi.fn().mockResolvedValue(null),
+      hasOwnedProcess: vi.fn().mockReturnValue(false),
+      connect: vi.fn().mockImplementation(async () => {
+        callCount++;
+        if (callCount === 1) {
+          throw new Error('WebSocket closed before handshake: unknown');
+        }
+      }),
+      maxStartAttempts: 3,
+    });
+
+    await runGatewayStartupSequence(hooks);
+
+    // resetStartupStderrLines should be called once per attempt
+    expect(hooks.resetStartupStderrLines).toHaveBeenCalledTimes(2);
+  });
+});
--- a/tests/unit/gateway-ws-client.test.ts
+++ b/tests/unit/gateway-ws-client.test.ts
@@ -12,6 +12,9 @@ const wsState = vi.hoisted(() => ({
        this.emit('close', code, Buffer.from(String(reason)));
      });
    });
+    readonly terminate = vi.fn(() => {
+      this.readyState = 3;
+    });
    readonly send = vi.fn((payload: string) => {
      this.sentFrames.push(payload);
    });
@@ -61,6 +64,7 @@ vi.mock('@electron/utils/logger', () => ({
 import {
  GATEWAY_CONNECT_HANDSHAKE_TIMEOUT_MS,
  connectGatewaySocket,
+  probeGatewayReady,
 } from '@electron/gateway/ws-client';

 async function flushMicrotasks(): Promise<void> {
@@ -182,3 +186,126 @@ describe('connectGatewaySocket', () => {
    expect(pendingRequests.size).toBe(0);
  });
 });
+
+describe('probeGatewayReady', () => {
+  beforeEach(() => {
+    vi.useFakeTimers();
+    wsState.sockets.length = 0;
+  });
+
+  afterEach(() => {
+    vi.useRealTimers();
+    vi.clearAllMocks();
+    wsState.sockets.length = 0;
+  });
+
+  it('resolves true when connect.challenge message is received', async () => {
+    const probePromise = probeGatewayReady(18789, 5000);
+    const socket = getLatestSocket();
+
+    socket.emitOpen();
+    socket.emitJsonMessage({
+      type: 'event',
+      event: 'connect.challenge',
+      payload: { nonce: 'probe-nonce' },
+    });
+
+    await expect(probePromise).resolves.toBe(true);
+    expect(socket.terminate).toHaveBeenCalled();
+  });
+
+  it('resolves false on WebSocket error', async () => {
+    const probePromise = probeGatewayReady(18789, 5000);
+    const socket = getLatestSocket();
+
+    socket.emit('error', new Error('ECONNREFUSED'));
+
+    await expect(probePromise).resolves.toBe(false);
+    expect(socket.terminate).toHaveBeenCalled();
+  });
+
+  it('resolves false on timeout when no message is received', async () => {
+    const probePromise = probeGatewayReady(18789, 2000);
+    const socket = getLatestSocket();
+
+    socket.emitOpen();
+    // No message sent — just advance time past timeout
+    await vi.advanceTimersByTimeAsync(2001);
+
+    await expect(probePromise).resolves.toBe(false);
+    expect(socket.terminate).toHaveBeenCalled();
+  });
+
+  it('resolves false when socket closes before challenge', async () => {
+    const probePromise = probeGatewayReady(18789, 5000);
+    const socket = getLatestSocket();
+
+    socket.emitOpen();
+    // Emit close directly (not through the mock's close() method)
+    socket.emit('close', 1006, Buffer.from(''));
+
+    await expect(probePromise).resolves.toBe(false);
+    expect(socket.terminate).toHaveBeenCalled();
+  });
+
+  it('does NOT resolve true on plain open event (key behavioral change)', async () => {
+    const probePromise = probeGatewayReady(18789, 500);
+    const socket = getLatestSocket();
+
+    // Only emit open — no connect.challenge message
+    socket.emitOpen();
+
+    // The old implementation would have resolved true here.
+    // The new implementation waits for connect.challenge.
+    await vi.advanceTimersByTimeAsync(501);
+
+    await expect(probePromise).resolves.toBe(false);
+  });
+
+  it('uses terminate() instead of close() for cleanup to avoid Windows TIME_WAIT', async () => {
+    const probePromise = probeGatewayReady(18789, 5000);
+    const socket = getLatestSocket();
+
+    socket.emitOpen();
+    socket.emitJsonMessage({
+      type: 'event',
+      event: 'connect.challenge',
+      payload: { nonce: 'nonce-1' },
+    });
+
+    await expect(probePromise).resolves.toBe(true);
+
+    // Must use terminate(), not close()
+    expect(socket.terminate).toHaveBeenCalledTimes(1);
+    expect(socket.close).not.toHaveBeenCalled();
+  });
+
+  it('ignores non-challenge messages', async () => {
+    const probePromise = probeGatewayReady(18789, 1000);
+    const socket = getLatestSocket();
+
+    socket.emitOpen();
+    // Send a message that is NOT connect.challenge
+    socket.emitJsonMessage({
+      type: 'event',
+      event: 'some.other.event',
+      payload: {},
+    });
+
+    // Should still be waiting — not resolved yet
+    await vi.advanceTimersByTimeAsync(1001);
+    await expect(probePromise).resolves.toBe(false);
+  });
+
+  it('ignores malformed JSON messages', async () => {
+    const probePromise = probeGatewayReady(18789, 1000);
+    const socket = getLatestSocket();
+
+    socket.emitOpen();
+    // Send raw invalid JSON
+    socket.emit('message', Buffer.from('not-json'));
+
+    await vi.advanceTimersByTimeAsync(1001);
+    await expect(probePromise).resolves.toBe(false);
+  });
+});