mirror of
https://github.com/openclaw/openclaw.git
synced 2026-02-19 18:39:20 -05:00
chore(gateway): use ws bind as lock
This commit is contained in:
@@ -1,28 +1,28 @@
|
||||
---
|
||||
summary: "Gateway lock strategy using POSIX flock and PID file"
|
||||
summary: "Gateway singleton guard using the WebSocket listener bind"
|
||||
read_when:
|
||||
- Running or debugging the gateway process
|
||||
- Investigating single-instance enforcement
|
||||
---
|
||||
# Gateway lock
|
||||
|
||||
Last updated: 2025-12-10
|
||||
Last updated: 2025-12-11
|
||||
|
||||
## Why
|
||||
- Ensure only one gateway instance runs per host.
|
||||
- Survive crashes/SIGKILL without leaving a blocking stale lock.
|
||||
- Keep the PID visible for observability and manual debugging.
|
||||
- Survive crashes/SIGKILL without leaving stale lock files.
|
||||
- Fail fast with a clear error when the control port is already occupied.
|
||||
|
||||
## Mechanism
|
||||
- Uses a single lock file (default `${os.tmpdir()}/clawdis-gateway.lock`, e.g. `/var/folders/.../clawdis-gateway.lock` on macOS) opened once per process.
|
||||
- An exclusive, non-blocking POSIX `flock` is taken on the file descriptor. The kernel releases the lock automatically on any process exit, including crashes and SIGKILL.
|
||||
- The PID is written into the same file after locking; the lock (not file existence) is the source of truth.
|
||||
- On graceful shutdown, we best-effort unlock, close, and unlink the file to reduce crumbs, but correctness does not rely on cleanup.
|
||||
- The gateway binds the WebSocket listener (default `ws://127.0.0.1:18789`) immediately on startup using an exclusive TCP listener.
|
||||
- If the bind fails with `EADDRINUSE`, startup throws `GatewayLockError("another gateway instance is already listening on ws://127.0.0.1:<port>")`.
|
||||
- The OS releases the listener automatically on any process exit, including crashes and SIGKILL—no separate lock file or cleanup step is needed.
|
||||
- On shutdown the gateway closes the WebSocket server and underlying HTTP server to free the port promptly.
|
||||
|
||||
## Error surface
|
||||
- If another instance holds the lock, startup throws `GatewayLockError("another gateway instance is already running")`.
|
||||
- Unexpected `flock` failures surface as `GatewayLockError("failed to acquire gateway lock: …")`.
|
||||
- If another process holds the port, startup throws `GatewayLockError("another gateway instance is already listening on ws://127.0.0.1:<port>")`.
|
||||
- Other bind failures surface as `GatewayLockError("failed to bind gateway socket on ws://127.0.0.1:<port>: …")`.
|
||||
|
||||
## Operational notes
|
||||
- The lock file may remain on disk after abnormal exits; this is expected and harmless because the kernel lock is gone.
|
||||
- If you need to inspect, `cat /tmp/clawdis-gateway.lock` shows the last PID. Do not delete the file while a process is running—you would only remove the convenience marker, not the lock itself.
|
||||
- If the port is occupied by *another* process, the error is the same; free the port or choose another with `clawdis gateway --port <port>`.
|
||||
- The macOS app still maintains its own lightweight PID guard before spawning the gateway; the runtime lock is enforced by the WebSocket bind.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
## Tests
|
||||
|
||||
- `pnpm test:force`: Kills any lingering gateway process holding the default lock/port, removes stale lock files, runs the full Vitest suite with an isolated temporary gateway lock path so gateway server tests don’t collide with a running instance. Use this when a prior gateway run left `/tmp/clawdis-gateway.lock` or port 18789 occupied.
|
||||
- `pnpm test:force`: Kills any lingering gateway process holding the default control port, then runs the full Vitest suite with an isolated gateway port so server tests don’t collide with a running instance. Use this when a prior gateway run left port 18789 occupied.
|
||||
- `pnpm test:coverage`: Runs Vitest with V8 coverage. Global thresholds are 70% lines/functions/statements and 55% branches. Coverage excludes integration-heavy entrypoints (CLI wiring, gateway/telegram bridges, webchat static server) to keep the target focused on unit-testable logic.
|
||||
|
||||
@@ -46,7 +46,6 @@
|
||||
"detect-libc": "^2.1.2",
|
||||
"dotenv": "^17.2.3",
|
||||
"express": "^5.2.1",
|
||||
"fs-ext": "^2.1.1",
|
||||
"grammy": "^1.38.4",
|
||||
"json5": "^2.2.3",
|
||||
"qrcode-terminal": "^0.12.0",
|
||||
@@ -61,7 +60,6 @@
|
||||
"@mariozechner/mini-lit": "0.2.1",
|
||||
"@types/body-parser": "^1.19.6",
|
||||
"@types/express": "^5.0.6",
|
||||
"@types/fs-ext": "^2.0.3",
|
||||
"@types/node": "^24.10.2",
|
||||
"@types/qrcode-terminal": "^0.12.2",
|
||||
"@types/ws": "^8.18.1",
|
||||
|
||||
@@ -1,48 +1,10 @@
|
||||
#!/usr/bin/env tsx
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { spawnSync } from "node:child_process";
|
||||
import { forceFreePort, type PortProcess } from "../src/cli/ports.js";
|
||||
|
||||
const DEFAULT_PORT = 18789;
|
||||
const DEFAULT_LOCK = path.join(os.tmpdir(), "clawdis-gateway.lock");
|
||||
|
||||
function killPid(pid: number, reason: string) {
|
||||
try {
|
||||
process.kill(pid, "SIGTERM");
|
||||
console.log(`sent SIGTERM to ${pid} (${reason})`);
|
||||
} catch (err) {
|
||||
if ((err as NodeJS.ErrnoException).code === "ESRCH") {
|
||||
console.log(`pid ${pid} (${reason}) not running`);
|
||||
} else {
|
||||
console.error(`failed to kill ${pid} (${reason}): ${String(err)}`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
function killLockHolder(lockPath: string) {
|
||||
if (!fs.existsSync(lockPath)) return;
|
||||
try {
|
||||
const contents = fs.readFileSync(lockPath, "utf8").trim();
|
||||
const pid = Number.parseInt(contents.split("\n")[0] ?? "", 10);
|
||||
if (Number.isFinite(pid)) {
|
||||
killPid(pid, "gateway lock holder");
|
||||
}
|
||||
} catch (err) {
|
||||
console.error(`failed to read lock ${lockPath}: ${String(err)}`);
|
||||
}
|
||||
}
|
||||
|
||||
function cleanupLock(lockPath: string) {
|
||||
if (!fs.existsSync(lockPath)) return;
|
||||
try {
|
||||
fs.rmSync(lockPath, { force: true });
|
||||
console.log(`removed gateway lock: ${lockPath}`);
|
||||
} catch (err) {
|
||||
console.error(`failed to remove lock ${lockPath}: ${String(err)}`);
|
||||
}
|
||||
}
|
||||
|
||||
function killGatewayListeners(port: number): PortProcess[] {
|
||||
try {
|
||||
@@ -86,16 +48,13 @@ function main() {
|
||||
process.env.CLAWDIS_GATEWAY_PORT ?? `${DEFAULT_PORT}`,
|
||||
10,
|
||||
);
|
||||
const lockPath = process.env.CLAWDIS_GATEWAY_LOCK ?? DEFAULT_LOCK;
|
||||
|
||||
console.log(`🧹 test:force - clearing gateway on port ${port}`);
|
||||
killLockHolder(lockPath);
|
||||
const killed = killGatewayListeners(port);
|
||||
if (killed.length === 0) {
|
||||
console.log("no listeners to kill");
|
||||
}
|
||||
|
||||
cleanupLock(lockPath);
|
||||
console.log("running pnpm test…");
|
||||
runTests();
|
||||
}
|
||||
|
||||
@@ -1,11 +1,9 @@
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { type AddressInfo, createServer } from "node:net";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, test, vi } from "vitest";
|
||||
import { describe, expect, test, vi } from "vitest";
|
||||
import { WebSocket } from "ws";
|
||||
import { emitAgentEvent } from "../infra/agent-events.js";
|
||||
import { startGatewayServer } from "./server.js";
|
||||
import { GatewayLockError } from "../infra/gateway-lock.js";
|
||||
|
||||
vi.mock("../commands/health.js", () => ({
|
||||
getHealthSnapshot: vi.fn().mockResolvedValue({ ok: true, stub: true }),
|
||||
@@ -27,19 +25,6 @@ vi.mock("../commands/agent.js", () => ({
|
||||
|
||||
process.env.CLAWDIS_SKIP_PROVIDERS = "1";
|
||||
|
||||
const originalLockPath = process.env.CLAWDIS_GATEWAY_LOCK_PATH;
|
||||
|
||||
beforeEach(() => {
|
||||
process.env.CLAWDIS_GATEWAY_LOCK_PATH = path.join(
|
||||
os.tmpdir(),
|
||||
`clawdis-gateway-${randomUUID()}.lock`,
|
||||
);
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
process.env.CLAWDIS_GATEWAY_LOCK_PATH = originalLockPath;
|
||||
});
|
||||
|
||||
async function getFreePort(): Promise<number> {
|
||||
return await new Promise((resolve, reject) => {
|
||||
const server = createServer();
|
||||
@@ -50,6 +35,17 @@ async function getFreePort(): Promise<number> {
|
||||
});
|
||||
}
|
||||
|
||||
async function occupyPort(): Promise<{ server: ReturnType<typeof createServer>; port: number }> {
|
||||
return await new Promise((resolve, reject) => {
|
||||
const server = createServer();
|
||||
server.once("error", reject);
|
||||
server.listen(0, "127.0.0.1", () => {
|
||||
const port = (server.address() as AddressInfo).port;
|
||||
resolve({ server, port });
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
function onceMessage<T = unknown>(
|
||||
ws: WebSocket,
|
||||
filter: (obj: unknown) => boolean,
|
||||
@@ -654,4 +650,12 @@ describe("gateway server", () => {
|
||||
ws.close();
|
||||
await server.close();
|
||||
});
|
||||
|
||||
test("refuses to start when port already bound", async () => {
|
||||
const { server: blocker, port } = await occupyPort();
|
||||
await expect(startGatewayServer(port)).rejects.toBeInstanceOf(
|
||||
GatewayLockError,
|
||||
);
|
||||
blocker.close();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -2,6 +2,7 @@ import { randomUUID } from "node:crypto";
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import { createServer as createHttpServer, type Server as HttpServer } from "node:http";
|
||||
import chalk from "chalk";
|
||||
import { type WebSocket, WebSocketServer } from "ws";
|
||||
import { createDefaultDeps } from "../cli/deps.js";
|
||||
@@ -17,7 +18,7 @@ import {
|
||||
} from "../config/sessions.js";
|
||||
import { isVerbose } from "../globals.js";
|
||||
import { onAgentEvent } from "../infra/agent-events.js";
|
||||
import { acquireGatewayLock, GatewayLockError } from "../infra/gateway-lock.js";
|
||||
import { GatewayLockError } from "../infra/gateway-lock.js";
|
||||
import { enqueueSystemEvent } from "../infra/system-events.js";
|
||||
import {
|
||||
listSystemPresence,
|
||||
@@ -255,15 +256,38 @@ export async function startGatewayServer(
|
||||
port = 18789,
|
||||
opts?: { webchatPort?: number },
|
||||
): Promise<GatewayServer> {
|
||||
const releaseLock = await acquireGatewayLock().catch((err) => {
|
||||
// Bubble known lock errors so callers can present a nice message.
|
||||
if (err instanceof GatewayLockError) throw err;
|
||||
throw new GatewayLockError(String(err));
|
||||
});
|
||||
const host = "127.0.0.1";
|
||||
const httpServer: HttpServer = createHttpServer();
|
||||
try {
|
||||
await new Promise<void>((resolve, reject) => {
|
||||
const onError = (err: NodeJS.ErrnoException) => {
|
||||
httpServer.off("listening", onListening);
|
||||
reject(err);
|
||||
};
|
||||
const onListening = () => {
|
||||
httpServer.off("error", onError);
|
||||
resolve();
|
||||
};
|
||||
httpServer.once("error", onError);
|
||||
httpServer.once("listening", onListening);
|
||||
httpServer.listen(port, host);
|
||||
});
|
||||
} catch (err) {
|
||||
const code = (err as NodeJS.ErrnoException).code;
|
||||
if (code === "EADDRINUSE") {
|
||||
throw new GatewayLockError(
|
||||
`another gateway instance is already listening on ws://${host}:${port}`,
|
||||
err,
|
||||
);
|
||||
}
|
||||
throw new GatewayLockError(
|
||||
`failed to bind gateway socket on ws://${host}:${port}: ${String(err)}`,
|
||||
err,
|
||||
);
|
||||
}
|
||||
|
||||
const wss = new WebSocketServer({
|
||||
port,
|
||||
host: "127.0.0.1",
|
||||
server: httpServer,
|
||||
maxPayload: MAX_PAYLOAD_BYTES,
|
||||
});
|
||||
const providerAbort = new AbortController();
|
||||
@@ -1183,7 +1207,6 @@ export async function startGatewayServer(
|
||||
|
||||
return {
|
||||
close: async () => {
|
||||
await releaseLock();
|
||||
providerAbort.abort();
|
||||
broadcast("shutdown", {
|
||||
reason: "gateway stopping",
|
||||
@@ -1211,6 +1234,9 @@ export async function startGatewayServer(
|
||||
clients.clear();
|
||||
await Promise.allSettled(providerTasks);
|
||||
await new Promise<void>((resolve) => wss.close(() => resolve()));
|
||||
await new Promise<void>((resolve, reject) =>
|
||||
httpServer.close((err) => (err ? reject(err) : resolve())),
|
||||
);
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
@@ -1,34 +0,0 @@
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { describe, expect, it } from "vitest";
|
||||
|
||||
import { acquireGatewayLock, GatewayLockError } from "./gateway-lock.js";
|
||||
|
||||
const newLockPath = () =>
|
||||
path.join(
|
||||
os.tmpdir(),
|
||||
`clawdis-gateway-lock-test-${process.pid}-${Math.random().toString(16).slice(2)}.sock`,
|
||||
);
|
||||
|
||||
describe("gateway-lock", () => {
|
||||
it("prevents concurrent gateway instances and releases cleanly", async () => {
|
||||
const lockPath = newLockPath();
|
||||
|
||||
const release1 = await acquireGatewayLock(lockPath);
|
||||
expect(fs.existsSync(lockPath)).toBe(true);
|
||||
|
||||
await expect(acquireGatewayLock(lockPath)).rejects.toBeInstanceOf(
|
||||
GatewayLockError,
|
||||
);
|
||||
|
||||
await release1();
|
||||
expect(fs.existsSync(lockPath)).toBe(false);
|
||||
|
||||
// After release, lock can be reacquired.
|
||||
const release2 = await acquireGatewayLock(lockPath);
|
||||
await release2();
|
||||
expect(fs.existsSync(lockPath)).toBe(false);
|
||||
});
|
||||
});
|
||||
@@ -1,84 +1,6 @@
|
||||
import fs from "node:fs";
|
||||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
|
||||
import { flockSync } from "fs-ext";
|
||||
import { getLogger } from "../logging.js";
|
||||
|
||||
const defaultLockPath = () =>
|
||||
process.env.CLAWDIS_GATEWAY_LOCK_PATH ??
|
||||
path.join(os.tmpdir(), "clawdis-gateway.lock");
|
||||
|
||||
export class GatewayLockError extends Error {}
|
||||
|
||||
type ReleaseFn = () => Promise<void>;
|
||||
|
||||
const SIGNALS: NodeJS.Signals[] = ["SIGINT", "SIGTERM", "SIGHUP"];
|
||||
|
||||
/**
|
||||
* Acquire an exclusive gateway lock using POSIX flock and write the PID into the same file.
|
||||
*
|
||||
* Kernel locks are released automatically when the process exits or is SIGKILLed, so the
|
||||
* lock cannot become stale. A best-effort unlink on shutdown keeps the path clean, but
|
||||
* correctness relies solely on the kernel lock.
|
||||
*/
|
||||
export async function acquireGatewayLock(
|
||||
lockPath = defaultLockPath(),
|
||||
): Promise<ReleaseFn> {
|
||||
fs.mkdirSync(path.dirname(lockPath), { recursive: true });
|
||||
|
||||
const fd = fs.openSync(lockPath, "w+");
|
||||
try {
|
||||
flockSync(fd, "exnb");
|
||||
} catch (err) {
|
||||
fs.closeSync(fd);
|
||||
const code = (err as NodeJS.ErrnoException).code;
|
||||
if (code === "EWOULDBLOCK" || code === "EAGAIN") {
|
||||
throw new GatewayLockError("another gateway instance is already running");
|
||||
}
|
||||
throw new GatewayLockError(
|
||||
`failed to acquire gateway lock: ${(err as Error).message}`,
|
||||
);
|
||||
export class GatewayLockError extends Error {
|
||||
constructor(message: string, public readonly cause?: unknown) {
|
||||
super(message);
|
||||
this.name = "GatewayLockError";
|
||||
}
|
||||
|
||||
fs.ftruncateSync(fd, 0);
|
||||
fs.writeSync(fd, `${process.pid}\n`, 0, "utf8");
|
||||
fs.fsyncSync(fd);
|
||||
getLogger().info({ pid: process.pid, lockPath }, "gateway lock acquired");
|
||||
|
||||
let released = false;
|
||||
const release = async (): Promise<void> => {
|
||||
if (released) return;
|
||||
released = true;
|
||||
try {
|
||||
flockSync(fd, "un");
|
||||
} catch {
|
||||
/* ignore unlock errors */
|
||||
}
|
||||
try {
|
||||
fs.closeSync(fd);
|
||||
} catch {
|
||||
/* ignore close errors */
|
||||
}
|
||||
try {
|
||||
fs.rmSync(lockPath, { force: true });
|
||||
} catch {
|
||||
/* ignore unlink errors */
|
||||
}
|
||||
};
|
||||
|
||||
const handleSignal = () => {
|
||||
void release();
|
||||
process.exit(0);
|
||||
};
|
||||
|
||||
for (const sig of SIGNALS) {
|
||||
process.once(sig, handleSignal);
|
||||
}
|
||||
|
||||
process.once("exit", () => {
|
||||
void release();
|
||||
});
|
||||
|
||||
return release;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user