fix(gateway): improve process termination handling and add timeout (#153)
This commit is contained in:
@@ -310,13 +310,23 @@ export class GatewayManager extends EventEmitter {
|
|||||||
return resolve();
|
return resolve();
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.info(`Sending SIGTERM to Gateway (pid=${child.pid ?? 'unknown'})`);
|
// Kill the entire process group so respawned children are also terminated.
|
||||||
|
// The gateway entry script may respawn itself; killing only the parent PID
|
||||||
|
// leaves the child orphaned (PPID=1) and still holding the port.
|
||||||
|
const pid = child.pid;
|
||||||
|
logger.info(`Sending SIGTERM to Gateway process group (pid=${pid ?? 'unknown'})`);
|
||||||
|
if (pid) {
|
||||||
|
try { process.kill(-pid, 'SIGTERM'); } catch { /* group kill failed, fall back */ }
|
||||||
|
}
|
||||||
child.kill('SIGTERM');
|
child.kill('SIGTERM');
|
||||||
|
|
||||||
// Force kill after timeout
|
// Force kill after timeout
|
||||||
const timeout = setTimeout(() => {
|
const timeout = setTimeout(() => {
|
||||||
if (child.exitCode === null && child.signalCode === null) {
|
if (child.exitCode === null && child.signalCode === null) {
|
||||||
logger.warn(`Gateway did not exit in time, sending SIGKILL (pid=${child.pid ?? 'unknown'})`);
|
logger.warn(`Gateway did not exit in time, sending SIGKILL (pid=${pid ?? 'unknown'})`);
|
||||||
|
if (pid) {
|
||||||
|
try { process.kill(-pid, 'SIGKILL'); } catch { /* ignore */ }
|
||||||
|
}
|
||||||
child.kill('SIGKILL');
|
child.kill('SIGKILL');
|
||||||
}
|
}
|
||||||
resolve();
|
resolve();
|
||||||
@@ -471,37 +481,33 @@ export class GatewayManager extends EventEmitter {
|
|||||||
const port = PORTS.OPENCLAW_GATEWAY;
|
const port = PORTS.OPENCLAW_GATEWAY;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const { stdout } = await new Promise<{ stdout: string }>((resolve) => {
|
const { stdout } = await new Promise<{ stdout: string }>((resolve, reject) => {
|
||||||
import('child_process').then(cp => {
|
import('child_process').then(cp => {
|
||||||
cp.exec(`lsof -i :${port} | grep LISTEN`, (err, stdout) => {
|
cp.exec(`lsof -i :${port} -sTCP:LISTEN -t`, { timeout: 5000 }, (err, stdout) => {
|
||||||
if (err) resolve({ stdout: '' });
|
if (err) resolve({ stdout: '' });
|
||||||
else resolve({ stdout });
|
else resolve({ stdout });
|
||||||
});
|
});
|
||||||
});
|
}).catch(reject);
|
||||||
});
|
});
|
||||||
|
|
||||||
if (stdout.trim()) {
|
if (stdout.trim()) {
|
||||||
// A process is listening on the port
|
const pids = stdout.trim().split('\n')
|
||||||
const pids = stdout.split('\n')
|
.map(s => s.trim())
|
||||||
.map(line => line.trim().split(/\s+/)[1])
|
.filter(Boolean);
|
||||||
.filter(pid => pid && pid !== 'PID');
|
|
||||||
|
|
||||||
if (pids.length > 0) {
|
if (pids.length > 0) {
|
||||||
// Try to kill it if it's not us to avoid connection issues
|
|
||||||
// This happens frequently on HMR / dev reloads
|
|
||||||
if (!this.process || !pids.includes(String(this.process.pid))) {
|
if (!this.process || !pids.includes(String(this.process.pid))) {
|
||||||
logger.info(`Found orphaned process listening on port ${port} (PID: ${pids[0]}), attempting to kill...`);
|
logger.info(`Found orphaned process listening on port ${port} (PIDs: ${pids.join(', ')}), attempting to kill...`);
|
||||||
for (const pid of pids) {
|
for (const pid of pids) {
|
||||||
try { process.kill(parseInt(pid), 'SIGKILL'); } catch { /* ignore */ }
|
try { process.kill(parseInt(pid), 'SIGKILL'); } catch { /* ignore */ }
|
||||||
}
|
}
|
||||||
// Wait a moment for port to be released
|
await new Promise(r => setTimeout(r, 1000));
|
||||||
await new Promise(r => setTimeout(r, 500));
|
return null;
|
||||||
return null; // Return null so we start a fresh instance
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (err) {
|
} catch (err) {
|
||||||
logger.debug('Error checking for existing process on port:', err);
|
logger.warn('Error checking for existing process on port:', err);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Try a quick WebSocket connection to check if gateway is listening
|
// Try a quick WebSocket connection to check if gateway is listening
|
||||||
@@ -797,7 +803,13 @@ export class GatewayManager extends EventEmitter {
|
|||||||
let handshakeTimeout: NodeJS.Timeout | null = null;
|
let handshakeTimeout: NodeJS.Timeout | null = null;
|
||||||
let settled = false;
|
let settled = false;
|
||||||
|
|
||||||
|
let challengeTimer: NodeJS.Timeout | null = null;
|
||||||
|
|
||||||
const cleanupHandshakeRequest = () => {
|
const cleanupHandshakeRequest = () => {
|
||||||
|
if (challengeTimer) {
|
||||||
|
clearTimeout(challengeTimer);
|
||||||
|
challengeTimer = null;
|
||||||
|
}
|
||||||
if (handshakeTimeout) {
|
if (handshakeTimeout) {
|
||||||
clearTimeout(handshakeTimeout);
|
clearTimeout(handshakeTimeout);
|
||||||
handshakeTimeout = null;
|
handshakeTimeout = null;
|
||||||
@@ -917,6 +929,17 @@ export class GatewayManager extends EventEmitter {
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Timeout for receiving the initial connect.challenge from the server.
|
||||||
|
// Without this, if the server never sends the challenge (e.g. orphaned
|
||||||
|
// process from a different version), the connect() promise hangs forever.
|
||||||
|
challengeTimer = setTimeout(() => {
|
||||||
|
if (!challengeReceived && !settled) {
|
||||||
|
logger.error('Gateway connect.challenge not received within timeout');
|
||||||
|
this.ws?.close();
|
||||||
|
rejectOnce(new Error('Timed out waiting for connect.challenge from Gateway'));
|
||||||
|
}
|
||||||
|
}, 10000);
|
||||||
|
|
||||||
this.ws.on('open', () => {
|
this.ws.on('open', () => {
|
||||||
logger.debug('Gateway WebSocket opened, waiting for connect.challenge...');
|
logger.debug('Gateway WebSocket opened, waiting for connect.challenge...');
|
||||||
});
|
});
|
||||||
@@ -934,6 +957,10 @@ export class GatewayManager extends EventEmitter {
|
|||||||
message.type === 'event' && message.event === 'connect.challenge'
|
message.type === 'event' && message.event === 'connect.challenge'
|
||||||
) {
|
) {
|
||||||
challengeReceived = true;
|
challengeReceived = true;
|
||||||
|
if (challengeTimer) {
|
||||||
|
clearTimeout(challengeTimer);
|
||||||
|
challengeTimer = null;
|
||||||
|
}
|
||||||
const nonce = message.payload?.nonce as string | undefined;
|
const nonce = message.payload?.nonce as string | undefined;
|
||||||
if (!nonce) {
|
if (!nonce) {
|
||||||
rejectOnce(new Error('Gateway connect.challenge missing nonce'));
|
rejectOnce(new Error('Gateway connect.challenge missing nonce'));
|
||||||
|
|||||||
Reference in New Issue
Block a user