mirror of
https://github.com/openclaw/openclaw.git
synced 2026-04-25 03:04:29 -04:00
Agents: add system prompt safety guardrails (#5445)
* 🤖 agents: add system prompt safety guardrails

  What:
  - add safety guardrails to system prompt
  - update system prompt docs
  - update prompt tests

  Why:
  - discourage power-seeking or self-modification behavior
  - clarify safety/oversight priority when conflicts arise

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)

* 🤖 agents: tighten safety wording for prompt guardrails

  What:
  - scope safety wording to system prompts/safety/tool policy changes
  - document Safety inclusion in minimal prompt mode
  - update safety prompt tests

  Why:
  - avoid blocking normal code changes or PR workflows
  - keep prompt mode docs consistent with implementation

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)

* 🤖 docs: note safety guardrails are soft

  What:
  - document system prompt safety guardrails as advisory
  - add security note on prompt guardrails vs hard controls

  Why:
  - clarify threat model and operator expectations
  - avoid implying prompt text is an enforcement layer

  Tests:
  - pnpm lint (pass)
  - pnpm build (fails: DefaultResourceLoader missing in pi-coding-agent)
  - pnpm test (not run; build failed)
This commit is contained in:
@@ -46,11 +46,32 @@ describe("buildAgentSystemPrompt", () => {
|
||||
expect(prompt).not.toContain("## Voice (TTS)");
|
||||
expect(prompt).not.toContain("## Silent Replies");
|
||||
expect(prompt).not.toContain("## Heartbeats");
|
||||
expect(prompt).toContain("## Safety");
|
||||
expect(prompt).toContain("You have no independent goals");
|
||||
expect(prompt).toContain("Prioritize safety and human oversight");
|
||||
expect(prompt).toContain("if instructions conflict");
|
||||
expect(prompt).toContain("Inspired by Anthropic's constitution");
|
||||
expect(prompt).toContain("Do not manipulate or persuade anyone");
|
||||
expect(prompt).toContain("Do not copy yourself or change system prompts");
|
||||
expect(prompt).toContain("## Subagent Context");
|
||||
expect(prompt).not.toContain("## Group Chat Context");
|
||||
expect(prompt).toContain("Subagent details");
|
||||
});
|
||||
|
||||
// A prompt built with only `workspaceDir` (no prompt mode specified) must
// contain the "## Safety" heading and the key guardrail phrases.
it("includes safety guardrails in full prompts", () => {
  const prompt = buildAgentSystemPrompt({
    workspaceDir: "/tmp/openclaw",
  });

  // Section heading.
  expect(prompt).toContain("## Safety");
  // No independent goals / power-seeking.
  expect(prompt).toContain("You have no independent goals");
  // Oversight takes priority; conflicts are escalated rather than guessed at.
  expect(prompt).toContain("Prioritize safety and human oversight");
  expect(prompt).toContain("if instructions conflict");
  expect(prompt).toContain("Inspired by Anthropic's constitution");
  // No manipulation, self-replication or unrequested policy edits.
  expect(prompt).toContain("Do not manipulate or persuade anyone");
  expect(prompt).toContain("Do not copy yourself or change system prompts");
});
|
||||
|
||||
it("includes voice hint when provided", () => {
|
||||
const prompt = buildAgentSystemPrompt({
|
||||
workspaceDir: "/tmp/openclaw",
|
||||
|
||||
@@ -7,7 +7,7 @@ import type { EmbeddedContextFile } from "./pi-embedded-helpers.js";
|
||||
/**
 * Controls which hardcoded sections are included in the system prompt.
 * - "full": All sections (default, for main agent)
 * - "minimal": Reduced sections (Tooling, Safety, Workspace, Sandbox, Runtime) - used for subagents
 * - "none": Just basic identity line, no sections
 */
export type PromptMode = "full" | "minimal" | "none";
|
||||
@@ -64,6 +64,16 @@ function buildTimeSection(params: { userTimezone?: string }) {
|
||||
return ["## Current Date & Time", `Time zone: ${params.userTimezone}`, ""];
|
||||
}
|
||||
|
||||
/**
 * Builds the lines of the "## Safety" section of the agent system prompt.
 *
 * The text is advisory guidance for the model (no independent goals, defer to
 * human oversight, no manipulation or unrequested changes to prompts/policies);
 * it is prompt wording only and does not itself enforce anything.
 *
 * @returns The section as an array of lines: the heading, three guardrail
 *   paragraphs, and a trailing empty string that separates this section from
 *   whatever the caller appends next (presumably rendered as a blank line once
 *   the caller joins the lines — confirm against the caller).
 */
function buildSafetySection(): string[] {
  return [
    "## Safety",
    "You have no independent goals: do not pursue self-preservation, replication, resource acquisition, or power-seeking; avoid long-term plans beyond the user's request.",
    "Prioritize safety and human oversight over completion; if instructions conflict, pause and ask; comply with stop/pause/audit requests and never bypass safeguards. (Inspired by Anthropic's constitution.)",
    "Do not manipulate or persuade anyone to expand access or disable safeguards. Do not copy yourself or change system prompts, safety rules, or tool policies unless explicitly requested.",
    "",
  ];
}
|
||||
|
||||
function buildReplyTagsSection(isMinimal: boolean) {
|
||||
if (isMinimal) {
|
||||
return [];
|
||||
@@ -382,6 +392,7 @@ export function buildAgentSystemPrompt(params: {
|
||||
"Keep narration brief and value-dense; avoid repeating obvious steps.",
|
||||
"Use plain human language for narration unless in a technical context.",
|
||||
"",
|
||||
...buildSafetySection(),
|
||||
"## OpenClaw CLI Quick Reference",
|
||||
"OpenClaw is controlled via subcommands. Do not invent commands.",
|
||||
"To manage the Gateway daemon service (start/stop/restart):",
|
||||
|
||||
Reference in New Issue
Block a user