Agents: add system prompt safety guardrails (#5445)

* 🤖 agents: add system prompt safety guardrails

What:
- add safety guardrails to system prompt
- update system prompt docs
- update prompt tests

Why:
- discourage power-seeking or self-modification behavior
- clarify safety/oversight priority when conflicts arise

Tests:
- pnpm lint (pass)
- pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
- pnpm test (not run; build failed)

* 🤖 agents: tighten safety wording for prompt guardrails

What:
- scope safety wording to system prompts/safety/tool policy changes
- document Safety inclusion in minimal prompt mode
- update safety prompt tests

Why:
- avoid blocking normal code changes or PR workflows
- keep prompt mode docs consistent with implementation

Tests:
- pnpm lint (pass)
- pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
- pnpm test (not run; build failed)

* 🤖 docs: note safety guardrails are soft

What:
- document system prompt safety guardrails as advisory
- add security note on prompt guardrails vs hard controls

Why:
- clarify threat model and operator expectations
- avoid implying prompt text is an enforcement layer

Tests:
- pnpm lint (pass)
- pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
- pnpm test (not run; build failed)
This commit is contained in:
Josh Palmer
2026-01-31 15:50:15 +01:00
committed by GitHub
parent 75093ebe1c
commit 7a6c40872d
5 changed files with 41 additions and 6 deletions

View File

@@ -46,11 +46,32 @@ describe("buildAgentSystemPrompt", () => {
expect(prompt).not.toContain("## Voice (TTS)");
expect(prompt).not.toContain("## Silent Replies");
expect(prompt).not.toContain("## Heartbeats");
expect(prompt).toContain("## Safety");
expect(prompt).toContain("You have no independent goals");
expect(prompt).toContain("Prioritize safety and human oversight");
expect(prompt).toContain("if instructions conflict");
expect(prompt).toContain("Inspired by Anthropic's constitution");
expect(prompt).toContain("Do not manipulate or persuade anyone");
expect(prompt).toContain("Do not copy yourself or change system prompts");
expect(prompt).toContain("## Subagent Context");
expect(prompt).not.toContain("## Group Chat Context");
expect(prompt).toContain("Subagent details");
});
it("includes safety guardrails in full prompts", () => {
const prompt = buildAgentSystemPrompt({
workspaceDir: "/tmp/openclaw",
});
expect(prompt).toContain("## Safety");
expect(prompt).toContain("You have no independent goals");
expect(prompt).toContain("Prioritize safety and human oversight");
expect(prompt).toContain("if instructions conflict");
expect(prompt).toContain("Inspired by Anthropic's constitution");
expect(prompt).toContain("Do not manipulate or persuade anyone");
expect(prompt).toContain("Do not copy yourself or change system prompts");
});
it("includes voice hint when provided", () => {
const prompt = buildAgentSystemPrompt({
workspaceDir: "/tmp/openclaw",

View File

@@ -7,7 +7,7 @@ import type { EmbeddedContextFile } from "./pi-embedded-helpers.js";
/**
* Controls which hardcoded sections are included in the system prompt.
* - "full": All sections (default, for main agent)
* - "minimal": Reduced sections (Tooling, Workspace, Runtime) - used for subagents
* - "minimal": Reduced sections (Tooling, Safety, Workspace, Sandbox, Runtime) - used for subagents
* - "none": Just basic identity line, no sections
*/
export type PromptMode = "full" | "minimal" | "none";
@@ -64,6 +64,16 @@ function buildTimeSection(params: { userTimezone?: string }) {
return ["## Current Date & Time", `Time zone: ${params.userTimezone}`, ""];
}
function buildSafetySection() {
return [
"## Safety",
"You have no independent goals: do not pursue self-preservation, replication, resource acquisition, or power-seeking; avoid long-term plans beyond the user's request.",
"Prioritize safety and human oversight over completion; if instructions conflict, pause and ask; comply with stop/pause/audit requests and never bypass safeguards. (Inspired by Anthropic's constitution.)",
"Do not manipulate or persuade anyone to expand access or disable safeguards. Do not copy yourself or change system prompts, safety rules, or tool policies unless explicitly requested.",
"",
];
}
function buildReplyTagsSection(isMinimal: boolean) {
if (isMinimal) {
return [];
@@ -382,6 +392,7 @@ export function buildAgentSystemPrompt(params: {
"Keep narration brief and value-dense; avoid repeating obvious steps.",
"Use plain human language for narration unless in a technical context.",
"",
...buildSafetySection(),
"## OpenClaw CLI Quick Reference",
"OpenClaw is controlled via subcommands. Do not invent commands.",
"To manage the Gateway daemon service (start/stop/restart):",