Agents: add system prompt safety guardrails (#5445)

* 🤖 agents: add system prompt safety guardrails

  What:
  - add safety guardrails to system prompt
  - update system prompt docs
  - update prompt tests

  Why:
  - discourage power-seeking or self-modification behavior
  - clarify safety/oversight priority when conflicts arise

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)

* 🤖 agents: tighten safety wording for prompt guardrails

  What:
  - scope safety wording to system prompts/safety/tool policy changes
  - document Safety inclusion in minimal prompt mode
  - update safety prompt tests

  Why:
  - avoid blocking normal code changes or PR workflows
  - keep prompt mode docs consistent with implementation

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)

* 🤖 docs: note safety guardrails are soft

  What:
  - document system prompt safety guardrails as advisory
  - add security note on prompt guardrails vs hard controls

  Why:
  - clarify threat model and operator expectations
  - avoid implying prompt text is an enforcement layer

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)
2026-04-25 03:04:29 -04:00 · 2026-01-31 15:50:15 +01:00
parent 75093ebe1c
commit 7a6c40872d
5 changed files with 41 additions and 6 deletions
--- a/src/agents/system-prompt.test.ts
+++ b/src/agents/system-prompt.test.ts
@@ -46,11 +46,32 @@ describe("buildAgentSystemPrompt", () => {
    expect(prompt).not.toContain("## Voice (TTS)");
    expect(prompt).not.toContain("## Silent Replies");
    expect(prompt).not.toContain("## Heartbeats");
+    expect(prompt).toContain("## Safety");
+    expect(prompt).toContain("You have no independent goals");
+    expect(prompt).toContain("Prioritize safety and human oversight");
+    expect(prompt).toContain("if instructions conflict");
+    expect(prompt).toContain("Inspired by Anthropic's constitution");
+    expect(prompt).toContain("Do not manipulate or persuade anyone");
+    expect(prompt).toContain("Do not copy yourself or change system prompts");
    expect(prompt).toContain("## Subagent Context");
    expect(prompt).not.toContain("## Group Chat Context");
    expect(prompt).toContain("Subagent details");
  });

+  it("includes safety guardrails in full prompts", () => { // Checks that a prompt built from workspaceDir alone carries the Safety heading and guardrail phrases.
+    const prompt = buildAgentSystemPrompt({
+      workspaceDir: "/tmp/openclaw",
+    }); // no other options are passed; presumably this selects the default full prompt mode - confirm against buildAgentSystemPrompt
+
+    expect(prompt).toContain("## Safety"); // section heading
+    expect(prompt).toContain("You have no independent goals"); // this and the following assertions pin distinctive substrings of the guardrail text
+    expect(prompt).toContain("Prioritize safety and human oversight");
+    expect(prompt).toContain("if instructions conflict");
+    expect(prompt).toContain("Inspired by Anthropic's constitution");
+    expect(prompt).toContain("Do not manipulate or persuade anyone");
+    expect(prompt).toContain("Do not copy yourself or change system prompts");
+  });
+
  it("includes voice hint when provided", () => {
    const prompt = buildAgentSystemPrompt({
      workspaceDir: "/tmp/openclaw",
--- a/src/agents/system-prompt.ts
+++ b/src/agents/system-prompt.ts
@@ -7,7 +7,7 @@ import type { EmbeddedContextFile } from "./pi-embedded-helpers.js";
 /**
 * Controls which hardcoded sections are included in the system prompt.
 * - "full": All sections (default, for main agent)
- * - "minimal": Reduced sections (Tooling, Workspace, Runtime) - used for subagents
+ * - "minimal": Reduced sections (Tooling, Safety, Workspace, Sandbox, Runtime) - used for subagents
 * - "none": Just basic identity line, no sections
 */
 export type PromptMode = "full" | "minimal" | "none";
@@ -64,6 +64,16 @@ function buildTimeSection(params: { userTimezone?: string }) {
  return ["## Current Date & Time", `Time zone: ${params.userTimezone}`, ""];
 }

+function buildSafetySection() { // Returns the lines of the Safety prompt section: heading, three guardrail paragraphs, and a trailing empty string. Takes no arguments.
+  return [
+    "## Safety",
+    "You have no independent goals: do not pursue self-preservation, replication, resource acquisition, or power-seeking; avoid long-term plans beyond the user's request.",
+    "Prioritize safety and human oversight over completion; if instructions conflict, pause and ask; comply with stop/pause/audit requests and never bypass safeguards. (Inspired by Anthropic's constitution.)",
+    "Do not manipulate or persuade anyone to expand access or disable safeguards. Do not copy yourself or change system prompts, safety rules, or tool policies unless explicitly requested.",
+    "", // empty final entry, as in buildTimeSection; presumably yields a blank line after the section once the prompt lines are joined - TODO confirm
+  ];
+}
+
 function buildReplyTagsSection(isMinimal: boolean) {
  if (isMinimal) {
    return [];
@@ -382,6 +392,7 @@ export function buildAgentSystemPrompt(params: {
    "Keep narration brief and value-dense; avoid repeating obvious steps.",
    "Use plain human language for narration unless in a technical context.",
    "",
+    ...buildSafetySection(),
    "## OpenClaw CLI Quick Reference",
    "OpenClaw is controlled via subcommands. Do not invent commands.",
    "To manage the Gateway daemon service (start/stop/restart):",