fix(processes): fix multiple clawx processes running concurrently (#589)

Co-authored-by: Cursor Agent <cursoragent@cursor.com>
Co-authored-by: Haze <hazeone@users.noreply.github.com>
Co-authored-by: paisley <8197966+su8su@users.noreply.github.com>
Co-authored-by: Felix <24791380+vcfgv@users.noreply.github.com>
This commit is contained in:
Haze
2026-03-20 18:34:20 +08:00
committed by GitHub
Unverified
parent 016ebb2b7b
commit 9b503b531b
15 changed files with 844 additions and 26 deletions

View File

@@ -36,7 +36,7 @@ const execAsync = promisify(exec);
* stale bot connections is to kill the Gateway process entirely and
* spawn a fresh one that reads the updated openclaw.json from scratch.
*/
async function restartGatewayForAgentDeletion(ctx: HostApiContext): Promise<void> {
export async function restartGatewayForAgentDeletion(ctx: HostApiContext): Promise<void> {
try {
// Capture the PID of the running Gateway BEFORE stop() clears it.
const status = ctx.gatewayManager.getStatus();
@@ -50,10 +50,14 @@ async function restartGatewayForAgentDeletion(ctx: HostApiContext): Promise<void
// and the old process stays alive with its stale channel connections.
if (pid) {
try {
process.kill(pid, 'SIGTERM');
// Give it a moment to die
await new Promise((resolve) => setTimeout(resolve, 500));
try { process.kill(pid, 0); process.kill(pid, 'SIGKILL'); } catch { /* already dead */ }
if (process.platform === 'win32') {
await execAsync(`taskkill /F /PID ${pid} /T`);
} else {
process.kill(pid, 'SIGTERM');
// Give it a moment to die
await new Promise((resolve) => setTimeout(resolve, 500));
try { process.kill(pid, 0); process.kill(pid, 'SIGKILL'); } catch { /* already dead */ }
}
} catch {
// process already gone that's fine
}
@@ -85,7 +89,7 @@ async function restartGatewayForAgentDeletion(ctx: HostApiContext): Promise<void
}
}
for (const p of pids) {
try { await execAsync(`taskkill /F /PID ${p}`); } catch { /* ignore */ }
try { await execAsync(`taskkill /F /PID ${p} /T`); } catch { /* ignore */ }
}
}
} catch {

View File

@@ -242,6 +242,7 @@ export class GatewayManager extends EventEmitter {
await this.connect(port, externalToken);
},
onConnectedToExistingGateway: () => {
// If the existing gateway is actually our own spawned UtilityProcess
// (e.g. after a self-restart code=1012), keep ownership so that
// stop() can still terminate the process during a restart() cycle.
@@ -250,6 +251,7 @@ export class GatewayManager extends EventEmitter {
this.ownsProcess = false;
this.setStatus({ pid: undefined });
}
this.startHealthCheck();
},
waitForPortFree: async (port) => {
@@ -356,6 +358,25 @@ export class GatewayManager extends EventEmitter {
this.setStatus({ state: 'stopped', error: undefined, pid: undefined, connectedAt: undefined, uptime: undefined });
}
/**
* Best-effort emergency cleanup for app-quit timeout paths.
* Only terminates a process this manager still owns.
*/
async forceTerminateOwnedProcessForQuit(): Promise<boolean> {
if (!this.process || !this.ownsProcess) {
return false;
}
const child = this.process;
await terminateOwnedGatewayProcess(child);
if (this.process === child) {
this.process = null;
}
this.ownsProcess = false;
this.setStatus({ pid: undefined });
return true;
}
/**
* Restart Gateway process
*/
@@ -724,6 +745,7 @@ export class GatewayManager extends EventEmitter {
this.process = child;
this.ownsProcess = true;
logger.debug(`Gateway manager now owns process pid=${child.pid ?? 'unknown'}`);
this.lastSpawnSummary = lastSpawnSummary;
}

View File

@@ -22,39 +22,58 @@ export function warmupManagedPythonReadiness(): void {
}
export async function terminateOwnedGatewayProcess(child: Electron.UtilityProcess): Promise<void> {
let exited = false;
const terminateWindowsProcessTree = async (pid: number): Promise<void> => {
const cp = await import('child_process');
await new Promise<void>((resolve) => {
cp.exec(`taskkill /F /PID ${pid} /T`, { timeout: 5000, windowsHide: true }, () => resolve());
});
};
await new Promise<void>((resolve) => {
let exited = false;
// Register a single exit listener before any kill attempt to avoid
// the race where exit fires between two separate `once('exit')` calls.
child.once('exit', () => {
exited = true;
clearTimeout(timeout);
resolve();
});
const pid = child.pid;
logger.info(`Sending kill to Gateway process (pid=${pid ?? 'unknown'})`);
try {
child.kill();
} catch {
// ignore if already exited
if (process.platform === 'win32' && pid) {
void terminateWindowsProcessTree(pid).catch((err) => {
logger.warn(`Windows process-tree kill failed for Gateway pid=${pid}:`, err);
});
} else {
try {
child.kill();
} catch {
// ignore if already exited
}
}
const timeout = setTimeout(() => {
if (!exited) {
logger.warn(`Gateway did not exit in time, force-killing (pid=${pid ?? 'unknown'})`);
if (pid) {
try {
process.kill(pid, 'SIGKILL');
} catch {
// ignore
if (process.platform === 'win32') {
void terminateWindowsProcessTree(pid).catch((err) => {
logger.warn(`Forced Windows process-tree kill failed for Gateway pid=${pid}:`, err);
});
} else {
try {
process.kill(pid, 'SIGKILL');
} catch {
// ignore
}
}
}
}
resolve();
}, 5000);
child.once('exit', () => {
clearTimeout(timeout);
});
});
}
@@ -226,6 +245,9 @@ export async function findExistingGatewayProcess(options: {
const pids = await getListeningProcessIds(port);
if (pids.length > 0 && (!ownedPid || !pids.includes(String(ownedPid)))) {
await terminateOrphanedProcessIds(port, pids);
if (process.platform === 'win32') {
await waitForPortFree(port, 10000);
}
return null;
}
} catch (err) {

View File

@@ -27,6 +27,13 @@ import {
createMainWindowFocusState,
requestSecondInstanceFocus,
} from './main-window-focus';
import {
createQuitLifecycleState,
markQuitCleanupCompleted,
requestQuitLifecycleAction,
} from './quit-lifecycle';
import { createSignalQuitHandler } from './signal-quit';
import { acquireProcessInstanceFileLock } from './process-instance-lock';
import { getSetting } from '../utils/store';
import { ensureBuiltinSkillsInstalled, ensurePreinstalledSkillsInstalled } from '../utils/skill-config';
import { ensureAllBundledPluginsInstalled } from '../utils/plugin-install';
@@ -68,10 +75,37 @@ if (process.platform === 'linux') {
// same port, then each treats the other's gateway as "orphaned" and kills
// it — creating an infinite kill/restart loop on Windows.
// The losing process must exit immediately so it never reaches Gateway startup.
const gotTheLock = app.requestSingleInstanceLock();
if (!gotTheLock) {
const gotElectronLock = app.requestSingleInstanceLock();
if (!gotElectronLock) {
console.info('[ClawX] Another instance already holds the single-instance lock; exiting duplicate process');
app.exit(0);
}
let releaseProcessInstanceFileLock: () => void = () => {};
let gotFileLock = true;
if (gotElectronLock) {
try {
const fileLock = acquireProcessInstanceFileLock({
userDataDir: app.getPath('userData'),
lockName: 'clawx',
});
gotFileLock = fileLock.acquired;
releaseProcessInstanceFileLock = fileLock.release;
if (!fileLock.acquired) {
const ownerDescriptor = fileLock.ownerPid
? `${fileLock.ownerFormat ?? 'legacy'} pid=${fileLock.ownerPid}`
: fileLock.ownerFormat === 'unknown'
? 'unknown lock format/content'
: 'unknown owner';
console.info(
`[ClawX] Another instance already holds process lock (${fileLock.lockPath}, ${ownerDescriptor}); exiting duplicate process`,
);
app.exit(0);
}
} catch (error) {
console.warn('[ClawX] Failed to acquire process instance file lock; continuing with Electron single-instance lock only', error);
}
}
const gotTheLock = gotElectronLock && gotFileLock;
// Global references
let mainWindow: BrowserWindow | null = null;
@@ -80,6 +114,7 @@ let clawHubService!: ClawHubService;
let hostEventBus!: HostEventBus;
let hostApiServer: Server | null = null;
const mainWindowFocusState = createMainWindowFocusState();
const quitLifecycleState = createQuitLifecycleState();
/**
* Resolve the icons directory path (works in both dev and packaged mode)
@@ -216,7 +251,7 @@ async function initialize(): Promise<void> {
logger.init();
logger.info('=== ClawX Application Starting ===');
logger.debug(
`Runtime: platform=${process.platform}/${process.arch}, electron=${process.versions.electron}, node=${process.versions.node}, packaged=${app.isPackaged}`
`Runtime: platform=${process.platform}/${process.arch}, electron=${process.versions.electron}, node=${process.versions.node}, packaged=${app.isPackaged}, pid=${process.pid}, ppid=${process.ppid}`
);
// Warm up network optimization (non-blocking)
@@ -413,6 +448,22 @@ async function initialize(): Promise<void> {
}
if (gotTheLock) {
const requestQuitOnSignal = createSignalQuitHandler({
logInfo: (message) => logger.info(message),
requestQuit: () => app.quit(),
});
process.on('exit', () => {
releaseProcessInstanceFileLock();
});
process.once('SIGINT', () => requestQuitOnSignal('SIGINT'));
process.once('SIGTERM', () => requestQuitOnSignal('SIGTERM'));
app.on('will-quit', () => {
releaseProcessInstanceFileLock();
});
if (process.platform === 'win32') {
app.setAppUserModelId(WINDOWS_APP_USER_MODEL_ID);
}
@@ -461,15 +512,69 @@ if (gotTheLock) {
}
});
app.on('before-quit', () => {
app.on('before-quit', (event) => {
setQuitting();
const action = requestQuitLifecycleAction(quitLifecycleState);
if (action === 'allow-quit') {
return;
}
event.preventDefault();
if (action === 'cleanup-in-progress') {
logger.debug('Quit requested while cleanup already in progress; waiting for shutdown task to finish');
return;
}
hostEventBus.closeAll();
hostApiServer?.close();
// Fire-and-forget: do not await gatewayManager.stop() here.
// Awaiting inside before-quit can stall Electron's quit sequence.
void gatewayManager.stop().catch((err) => {
const stopPromise = gatewayManager.stop().catch((err) => {
logger.warn('gatewayManager.stop() error during quit:', err);
});
const timeoutPromise = new Promise<'timeout'>((resolve) => {
setTimeout(() => resolve('timeout'), 5000);
});
void Promise.race([stopPromise.then(() => 'stopped' as const), timeoutPromise]).then((result) => {
if (result === 'timeout') {
logger.warn('Gateway shutdown timed out during app quit; proceeding with forced quit');
void gatewayManager.forceTerminateOwnedProcessForQuit().then((terminated) => {
if (terminated) {
logger.warn('Forced gateway process termination completed after quit timeout');
}
}).catch((err) => {
logger.warn('Forced gateway termination failed after quit timeout:', err);
});
}
markQuitCleanupCompleted(quitLifecycleState);
app.quit();
});
});
// Best-effort Gateway cleanup on unexpected crashes.
// These handlers attempt to terminate the Gateway child process within a
// short timeout before force-exiting, preventing orphaned processes.
const emergencyGatewayCleanup = (reason: string, error: unknown): void => {
logger.error(`${reason}:`, error);
try {
void gatewayManager?.stop().catch(() => { /* ignore */ });
} catch {
// ignore — stop() may not be callable if state is corrupted
}
// Give Gateway stop a brief window, then force-exit.
setTimeout(() => {
process.exit(1);
}, 3000).unref();
};
process.on('uncaughtException', (error) => {
emergencyGatewayCleanup('Uncaught exception in main process', error);
});
process.on('unhandledRejection', (reason) => {
emergencyGatewayCleanup('Unhandled promise rejection in main process', reason);
});
}

View File

@@ -0,0 +1,181 @@
import { closeSync, existsSync, mkdirSync, openSync, readFileSync, rmSync, writeFileSync } from 'node:fs';
import { join } from 'node:path';
// Value a structured (JSON) lock file must carry in its `schema` field to be recognized.
const LOCK_SCHEMA = 'clawx-instance-lock';
// Value a structured (JSON) lock file must carry in its `version` field to be recognized.
const LOCK_VERSION = 1;
/**
 * Result of an attempt to take the per-user-data-directory instance lock file.
 */
export interface ProcessInstanceFileLock {
// True when this call created the lock file; false when it could not.
acquired: boolean;
// Absolute path of the lock file that was (or would have been) created.
lockPath: string;
// PID read from an existing lock file; only populated when `acquired` is false and the content was parseable.
ownerPid?: number;
// Format of the existing lock file's content; only populated when `acquired` is false.
ownerFormat?: 'legacy' | 'structured' | 'unknown';
// Removes the lock file if it still names the acquiring PID; a no-op when the lock was not acquired.
release: () => void;
}
/**
 * Inputs for acquireProcessInstanceFileLock.
 */
export interface ProcessInstanceFileLockOptions {
// Directory that holds the lock file; it is created (recursively) if missing.
userDataDir: string;
// Base name of the lock file; the file is named `<lockName>.instance.lock`.
lockName: string;
// PID written into the lock file; defaults to `process.pid`.
pid?: number;
// Liveness probe used to detect stale locks; defaults to defaultPidAlive.
isPidAlive?: (pid: number) => boolean;
}
/**
 * Default liveness probe: sends signal 0 to `pid`.
 *
 * Only an `ESRCH` failure is taken as proof that the process is gone; any
 * other failure code is conservatively reported as "alive".
 *
 * @param pid Process id to probe.
 * @returns `false` only when the probe fails with `ESRCH`, otherwise `true`.
 */
function defaultPidAlive(pid: number): boolean {
  let alive = true;
  try {
    // Signal 0 performs the existence/permission check without delivering a signal.
    process.kill(pid, 0);
  } catch (probeError) {
    const code = (probeError as NodeJS.ErrnoException).code;
    alive = code !== 'ESRCH';
  }
  return alive;
}
// Outcome of reading an existing lock file:
//  - 'legacy': the trimmed content is a plain positive decimal PID
//  - 'structured': the content is JSON matching LOCK_SCHEMA / LOCK_VERSION with a positive `pid`
//  - 'unknown': the file could not be read or matched neither format
type ParsedLockOwner =
| { kind: 'legacy'; pid: number }
| { kind: 'structured'; pid: number }
| { kind: 'unknown' };
// Shape of the JSON lock-file format accepted by parseStructuredLockContent.
interface StructuredLockContent {
// Must equal LOCK_SCHEMA.
schema: string;
// Must equal LOCK_VERSION.
version: number;
// PID of the lock owner; must be a finite number greater than zero.
pid: number;
}
/**
 * Parses a string consisting solely of decimal digits into a positive PID.
 *
 * @param raw Candidate text (no surrounding whitespace or sign is tolerated).
 * @returns The parsed number, or `undefined` when `raw` is not all digits,
 *          parses to zero, or overflows to a non-finite value.
 */
function parsePositivePid(raw: string): number | undefined {
  const isAllDigits = /^\d+$/.test(raw);
  if (!isAllDigits) {
    return undefined;
  }
  const value = Number.parseInt(raw, 10);
  return Number.isFinite(value) && value > 0 ? value : undefined;
}
/**
 * Parses the JSON lock-file format.
 *
 * @param raw Trimmed lock-file content.
 * @returns A normalized `{ schema, version, pid }` object when `raw` is JSON
 *          whose `schema` equals LOCK_SCHEMA, whose `version` equals
 *          LOCK_VERSION and whose `pid` is a finite number greater than zero;
 *          `undefined` for anything else, including invalid JSON.
 */
function parseStructuredLockContent(raw: string): StructuredLockContent | undefined {
  let candidate: Partial<StructuredLockContent> | null | undefined;
  try {
    candidate = JSON.parse(raw) as Partial<StructuredLockContent>;
  } catch {
    // Not JSON at all.
    return undefined;
  }

  // Optional chaining tolerates `null` and primitive JSON values.
  const schema = candidate?.schema;
  const version = candidate?.version;
  const pid = candidate?.pid;

  if (schema !== LOCK_SCHEMA || version !== LOCK_VERSION) {
    return undefined;
  }
  if (typeof pid !== 'number' || !Number.isFinite(pid) || pid <= 0) {
    return undefined;
  }
  return { schema, version, pid };
}
/**
 * Reads a lock file and classifies its owner.
 *
 * The plain-digits ("legacy") format is tried first, then the JSON
 * ("structured") format.
 *
 * @param lockPath Path of the lock file to inspect.
 * @returns The owner's PID tagged with the detected format, or
 *          `{ kind: 'unknown' }` when the file cannot be read or matches
 *          neither format.
 */
function readLockOwner(lockPath: string): ParsedLockOwner {
  const unknownOwner: ParsedLockOwner = { kind: 'unknown' };
  try {
    const text = readFileSync(lockPath, 'utf8').trim();

    const numericPid = parsePositivePid(text);
    if (numericPid !== undefined) {
      return { kind: 'legacy', pid: numericPid };
    }

    const json = parseStructuredLockContent(text);
    return json ? { kind: 'structured', pid: json.pid } : unknownOwner;
  } catch {
    // Read failures are reported the same way as unrecognized content.
    return unknownOwner;
  }
}
/**
 * Tries to take an exclusive, file-based instance lock at
 * `<userDataDir>/<lockName>.instance.lock`.
 *
 * The lock file is created with the exclusive `wx` flag and contains the
 * owner's PID as plain decimal text. If the file already exists and names a
 * PID that `isPidAlive` reports as dead, the stale file is removed and creation
 * is retried once. A lock file with unrecognized content is never treated as
 * stale.
 *
 * @param options Lock location, the PID to record (defaults to `process.pid`)
 *                and the liveness probe (defaults to `defaultPidAlive`).
 * @returns `acquired: true` with a `release` callback on success; otherwise
 *          `acquired: false` with whatever owner information could be read
 *          and a no-op `release`.
 * @throws Whatever `mkdirSync` throws when the directory cannot be created.
 */
export function acquireProcessInstanceFileLock(
  options: ProcessInstanceFileLockOptions,
): ProcessInstanceFileLock {
  const pid = options.pid ?? process.pid;
  const isPidAlive = options.isPidAlive ?? defaultPidAlive;

  mkdirSync(options.userDataDir, { recursive: true });
  const lockPath = join(options.userDataDir, `${options.lockName}.instance.lock`);

  let ownerPid: number | undefined;
  let ownerFormat: ProcessInstanceFileLock['ownerFormat'] = 'unknown';

  for (let attempt = 0; attempt < 2; attempt += 1) {
    try {
      // 'wx' fails with EEXIST when the file is already present, which makes
      // creation the atomic "take the lock" step.
      const fd = openSync(lockPath, 'wx');
      try {
        try {
          // Keep writing legacy numeric format for broad backward compatibility.
          // Parser accepts both legacy numeric and structured JSON formats.
          writeFileSync(fd, String(pid), 'utf8');
        } finally {
          closeSync(fd);
        }
      } catch (writeError) {
        // We created the file but could not finish writing it. Leaving an
        // empty/partial file behind would be read as an 'unknown' owner, which
        // is never considered stale and would block every later launch.
        try {
          rmSync(lockPath, { force: true });
        } catch {
          // best-effort
        }
        throw writeError;
      }

      let released = false;
      return {
        acquired: true,
        lockPath,
        release: () => {
          if (released) return;
          released = true;
          try {
            // Only delete the file while it still names this PID, so a lock
            // re-created by another process is left untouched.
            const currentOwner = readLockOwner(lockPath);
            if (currentOwner.kind === 'unknown' || currentOwner.pid !== pid) {
              return;
            }
            rmSync(lockPath, { force: true });
          } catch {
            // best-effort
          }
        },
      };
    } catch (error) {
      const errno = (error as NodeJS.ErrnoException).code;
      if (errno !== 'EEXIST') {
        // Any failure other than "lock file already exists" is reported as not acquired.
        break;
      }

      const owner = readLockOwner(lockPath);
      if (owner.kind === 'legacy' || owner.kind === 'structured') {
        ownerPid = owner.pid;
        ownerFormat = owner.kind;
      } else {
        ownerPid = undefined;
        ownerFormat = 'unknown';
      }

      const shouldTreatAsStale =
        (owner.kind === 'legacy' || owner.kind === 'structured')
        && !isPidAlive(owner.pid);

      if (!existsSync(lockPath)) {
        // The previous owner released the lock between our failed create and
        // this check; the lock is free, so retry instead of reporting it held.
        continue;
      }

      if (shouldTreatAsStale) {
        try {
          rmSync(lockPath, { force: true });
          continue;
        } catch {
          // If deletion fails, treat as held lock.
        }
      }
      break;
    }
  }

  return {
    acquired: false,
    lockPath,
    ownerPid,
    ownerFormat,
    release: () => {
      // no-op when lock wasn't acquired
    },
  };
}

View File

@@ -0,0 +1,30 @@
/**
 * Mutable bookkeeping for the app-quit cleanup sequence.
 */
export interface QuitLifecycleState {
// Set to true by requestQuitLifecycleAction the first time a quit is requested.
cleanupStarted: boolean;
// Set to true by markQuitCleanupCompleted; once true, quit requests are allowed through.
cleanupCompleted: boolean;
}
// Decision returned by requestQuitLifecycleAction: begin cleanup (first request),
// keep waiting (cleanup already started), or let the quit proceed (cleanup completed).
export type QuitLifecycleAction = 'start-cleanup' | 'cleanup-in-progress' | 'allow-quit';
/**
 * Creates the initial quit-lifecycle state: cleanup neither started nor completed.
 *
 * @returns A fresh, independent state object on every call.
 */
export function createQuitLifecycleState(): QuitLifecycleState {
  const initialState: QuitLifecycleState = {
    cleanupStarted: false,
    cleanupCompleted: false,
  };
  return initialState;
}
/**
 * Decides how a quit request should be handled and records that cleanup has
 * begun when this is the first request.
 *
 * @param state Quit-lifecycle state; `cleanupStarted` is set to true on the
 *              first request.
 * @returns `'allow-quit'` once cleanup has completed, `'cleanup-in-progress'`
 *          while a previously started cleanup is still running, and
 *          `'start-cleanup'` for the first request.
 */
export function requestQuitLifecycleAction(state: QuitLifecycleState): QuitLifecycleAction {
  const { cleanupCompleted, cleanupStarted } = state;

  if (!cleanupCompleted && !cleanupStarted) {
    // First quit request: the caller is now responsible for running cleanup.
    state.cleanupStarted = true;
    return 'start-cleanup';
  }

  // Completion takes precedence over the "started" flag.
  return cleanupCompleted ? 'allow-quit' : 'cleanup-in-progress';
}
/**
 * Records that quit cleanup has finished, so later quit requests are allowed through.
 *
 * @param state Quit-lifecycle state to update in place.
 */
export function markQuitCleanupCompleted(state: QuitLifecycleState): void {
  state.cleanupCompleted = true;
}

View File

@@ -0,0 +1,11 @@
/**
 * Callbacks injected into createSignalQuitHandler.
 */
export interface SignalQuitHandlerHooks {
// Receives the "Received <signal>; requesting app quit" message.
logInfo: (message: string) => void;
// Invoked after logging to ask the application to quit.
requestQuit: () => void;
}
/**
 * Builds a signal handler that logs the received signal and then requests an
 * application quit through the supplied hooks.
 *
 * @param hooks Logging and quit callbacks; they are looked up on `hooks` each
 *              time the handler runs.
 * @returns A function to call with the name of the signal that was received.
 */
export function createSignalQuitHandler(hooks: SignalQuitHandlerHooks): (signal: NodeJS.Signals) => void {
  const onSignal = (signal: NodeJS.Signals): void => {
    const message = `Received ${signal}; requesting app quit`;
    hooks.logInfo(message);
    hooks.requestQuit();
  };
  return onSignal;
}