openclaw/scripts/sync-credits.py

#!/usr/bin/env python3
"""
Sync maintainers and contributors in docs/reference/credits.md from git/GitHub.

- Maintainers: people who have merged PRs (via GitHub API) + direct pushes to main
- Contributors: all unique commit authors on main with commit counts

Usage: python scripts/sync-credits.py
"""

import re
import subprocess
from datetime import datetime, timezone
from pathlib import Path

# Parent of the directory that contains this script; used as the working
# directory for every git/gh subprocess.
REPO_ROOT = Path(__file__).parent.parent
# Markdown file whose "## Maintainers" and "## Contributors" sections are rewritten.
CREDITS_FILE = REPO_ROOT / "docs" / "reference" / "credits.md"
# "owner/name" slug passed to `gh pr list --repo`.
REPO = "openclaw/openclaw"

# Exclude bot accounts from maintainer list.
# Entries are compared exactly (case-sensitive) against the login that merged
# each PR, so they must match the form printed by `gh`.
EXCLUDED_MAINTAINERS = {
    "app/clawdinator",
    "clawdinator",
    "github-actions",
    "dependabot",
}

# Exclude bot/system names from contributor list.
# Entries are compared exactly (case-sensitive) against the raw git author
# name when listing contributors, and against the raw committer name when
# counting direct pushes for maintainers.
EXCLUDED_CONTRIBUTORS = {
    "GitHub",
    "github-actions[bot]",
    "dependabot[bot]",
    "clawdinator[bot]",
    "blacksmith-sh[bot]",
    "google-labs-jules[bot]",
    "Maude Bot",
    "Pocket Clawd",
    "Ghost",
    "Gregor's Bot",
    "Jarvis",
    "Jarvis Deploy",
    "CI",
    "Ubuntu",
    "user",
    "Developer",
    # Bot names that appear in git history
    "CLAWDINATOR Bot",
    "Clawd",
    "Clawdbot",
    "Clawdbot Maintainers",
    "Claude Code",
    "L36 Server",
    "seans-openclawbot",
    "therealZpoint-bot",
    "Vultr-Clawd Admin",
    "hyf0-agent",
}

# Minimum merged PRs to be considered a maintainer; logins with fewer merges
# are left out of the maintainer list regardless of their direct pushes.
MIN_MERGES = 2


# Regex to extract GitHub username from noreply email
# Matches: ID+username@users.noreply.github.com or username@users.noreply.github.com
GITHUB_NOREPLY_RE = re.compile(r"^(?:\d+\+)?([^@]+)@users\.noreply\.github\.com$", re.I)


def extract_github_username(email: str) -> str | None:
    """Extract GitHub username from noreply email, or return None."""
    match = GITHUB_NOREPLY_RE.match(email)
    return match.group(1).lower() if match else None


def sanitize_name(name: str) -> str:
    """Return ``name`` with every ``{`` and ``}`` removed and outer whitespace trimmed.

    MDX treats curly braces as the start of a JS expression, so they must not
    reach the generated document.
    """
    without_braces = name.translate(str.maketrans("", "", "{}"))
    return without_braces.strip()


def run_git(*args: str) -> str:
    """Invoke ``git`` with ``args`` inside the repository and return its stdout.

    Output is decoded as UTF-8 (undecodable bytes are replaced) and stripped
    of surrounding whitespace. A non-zero exit status raises
    ``subprocess.CalledProcessError`` because ``check=True`` is set.
    """
    command = ["git"]
    command.extend(args)
    completed = subprocess.run(
        command,
        check=True,
        cwd=REPO_ROOT,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
    )
    return completed.stdout.strip()


def run_gh(*args: str) -> str:
    """Invoke the ``gh`` CLI with ``args`` inside the repository and return its stdout.

    Output is decoded as UTF-8 (undecodable bytes are replaced) and stripped
    of surrounding whitespace. A non-zero exit status raises
    ``subprocess.CalledProcessError`` because ``check=True`` is set.
    """
    command = ["gh"]
    command.extend(args)
    completed = subprocess.run(
        command,
        check=True,
        cwd=REPO_ROOT,
        capture_output=True,
        text=True,
        encoding="utf-8",
        errors="replace",
    )
    return completed.stdout.strip()


def categorize_commit_files(files: list[str]) -> str:
    """Classify a commit by the paths it touched.

    Paths are compared case-insensitively and each path falls into exactly
    one bucket:

    - CI: starts with ``.github/`` or ``scripts/ci``
    - docs: starts with ``docs/`` or ends with ``.md``
    - code: everything else

    Returns:
        ``'ci'`` if any path is a CI path (takes priority over everything),
        ``'docs'`` if docs and code paths are mixed,
        ``'docs only'`` if every path is a docs path,
        ``'other'`` for code-only commits and for an empty file list.
    """
    touches_ci = touches_docs = touches_code = False

    for path in files:
        lowered = path.lower()
        if lowered.startswith((".github/", "scripts/ci")):
            touches_ci = True
        elif lowered.startswith("docs/") or lowered.endswith(".md"):
            touches_docs = True
        else:
            touches_code = True

    # Guard clauses, highest priority first.
    if touches_ci:
        return "ci"
    if touches_docs and touches_code:
        return "docs"
    if touches_docs:
        return "docs only"
    return "other"


def _count_direct_pushes(log_output: str) -> dict[str, dict[str, int]]:
    """Tally commits per committer, broken down by file category.

    ``log_output`` is the text produced by
    ``git log --format=COMMIT|%cN|%cE --name-only``: one ``COMMIT|name|email``
    header per commit followed by the paths that commit changed.

    Returns a mapping ``key -> {"ci": N, "docs only": N, "docs": N, "other": N}``
    where ``key`` is the GitHub username taken from a noreply committer email,
    or the lower-cased committer name when the email is not a noreply address.
    Commits by names in ``EXCLUDED_CONTRIBUTORS`` and commits that list no
    files are not counted.
    """
    header_prefix = "COMMIT|"
    push_counts: dict[str, dict[str, int]] = {}

    def record(key: str | None, files: list[str]) -> None:
        # Only commits with an accepted committer AND at least one file count.
        if not key or not files:
            return
        buckets = push_counts.setdefault(
            key, {"ci": 0, "docs only": 0, "docs": 0, "other": 0}
        )
        buckets[categorize_commit_files(files)] += 1

    current_key: str | None = None
    current_files: list[str] = []

    for raw_line in log_output.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if not line.startswith(header_prefix):
            # A changed-file path belonging to the commit being collected.
            if current_key:
                current_files.append(line)
            continue

        # New commit header: close out the previous commit first.
        record(current_key, current_files)
        current_key, current_files = None, []

        # Split on the LAST "|" so a committer name that itself contains "|"
        # stays intact (same approach get_contributors uses for author lines).
        name, separator, email = line[len(header_prefix):].rpartition("|")
        if not separator:
            continue
        name = name.strip()
        email = email.strip().lower()
        if not name or name in EXCLUDED_CONTRIBUTORS:
            continue

        # Prefer the GitHub username from a noreply email; else the name.
        current_key = extract_github_username(email) or name.lower()

    # Close out the final commit in the log.
    record(current_key, current_files)
    return push_counts


def get_maintainers() -> list[tuple[str, int, dict[str, int]]]:
    """Get maintainers as (login, merge_count, push_counts_by_category).

    - Merges: the ``mergedBy`` login of every merged PR returned by
      ``gh pr list`` (at most 10000 PRs, per ``--limit``), skipping logins in
      ``EXCLUDED_MAINTAINERS``.
    - Direct pushes: non-merge commits on ``main``, attributed by committer
      and categorized into 'ci', 'docs only', 'docs' and 'other'. They are
      matched to a login by lower-casing the login.

    Only logins with at least ``MIN_MERGES`` merged PRs are returned. The
    list is sorted by total activity (merges + pushes) descending, then by
    login case-insensitively.

    Raises:
        subprocess.CalledProcessError: if the ``gh`` or ``git`` command fails.
    """
    # 1. Count merged PRs per login.
    print("  Fetching merged PRs from GitHub API...")
    output = run_gh(
        "pr",
        "list",
        "--repo",
        REPO,
        "--state",
        "merged",
        "--limit",
        "10000",
        "--json",
        "mergedBy",
        "--jq",
        ".[].mergedBy.login",
    )

    merge_counts: dict[str, int] = {}
    for login in output.splitlines():
        login = login.strip()
        if login and login not in EXCLUDED_MAINTAINERS:
            merge_counts[login] = merge_counts.get(login, 0) + 1

    print(
        f"  Found {sum(merge_counts.values())} merged PRs by {len(merge_counts)} users"
    )

    # 2. Count direct pushes (non-merge commits by committer) with categories.
    print("  Counting direct pushes from git history...")
    push_counts = _count_direct_pushes(
        run_git("log", "main", "--no-merges", "--format=COMMIT|%cN|%cE", "--name-only")
    )

    # 3. Build maintainer list: anyone with merges >= MIN_MERGES.
    maintainers: list[tuple[str, int, dict[str, int]]] = []
    for login, merges in merge_counts.items():
        if merges < MIN_MERGES:
            continue
        # Push keys are lower-cased, so look the login up case-insensitively.
        pushes = push_counts.get(
            login.lower(), {"ci": 0, "docs only": 0, "docs": 0, "other": 0}
        )
        maintainers.append((login, merges, pushes))

    # Sort by total activity (merges + sum of pushes) descending, then login.
    maintainers.sort(key=lambda x: (-(x[1] + sum(x[2].values())), x[0].lower()))
    return maintainers


def get_contributors() -> list[tuple[str, int]]:
    """Return ``(display_name, commit_count)`` for every commit author on main.

    Authors are folded together using the first rule that applies:

    1. the GitHub username embedded in a noreply author email;
    2. an author name that equals (case-insensitively) a username seen in
       any noreply email in the log;
    3. the sanitized author name, compared case-insensitively.

    Names listed in ``EXCLUDED_CONTRIBUTORS`` are skipped. The result is
    sorted by commit count descending, then by name case-insensitively.
    """
    log_output = run_git("log", "main", "--format=%aN|%aE")
    if not log_output:
        return []

    # Parse every "name|email" line once; the email is whatever follows the
    # last "|" so names containing "|" survive.
    records: list[tuple[str, str]] = []
    for raw_line in log_output.splitlines():
        author, separator, address = raw_line.strip().rpartition("|")
        if separator:
            records.append((author.strip(), address.strip().lower()))

    # Pass 1: every username that appears in a noreply address, including
    # addresses on commits that are excluded from the counts below.
    known_handles: set[str] = set()
    for _, address in records:
        if not address:
            continue
        handle = extract_github_username(address)
        if handle:
            known_handles.add(handle)

    def outranks(candidate: str, incumbent: str) -> bool:
        # A capitalized spelling beats an uncapitalized one; when both agree
        # on capitalization the longer (more specific) spelling wins.
        candidate_capitalized = candidate[0].isupper()
        incumbent_capitalized = incumbent[0].isupper()
        if candidate_capitalized != incumbent_capitalized:
            return candidate_capitalized
        return len(candidate) > len(incumbent)

    # Pass 2: count commits per merge key and remember the best display name.
    tallies: dict[str, int] = {}
    display_names: dict[str, str] = {}

    for author, address in records:
        if not author or not address or author in EXCLUDED_CONTRIBUTORS:
            continue

        # Strip MDX-unsafe characters before using the name as a key.
        clean = sanitize_name(author)
        if not clean:
            continue
        folded = clean.lower()

        handle = extract_github_username(address)
        if handle:
            key = f"gh:{handle}"
        elif folded in known_handles:
            key = f"gh:{folded}"
        else:
            key = f"name:{folded}"

        tallies[key] = tallies.get(key, 0) + 1

        if key not in display_names or outranks(clean, display_names[key]):
            display_names[key] = clean

    return sorted(
        ((display_names[key], total) for key, total in tallies.items()),
        key=lambda entry: (-entry[1], entry[0].lower()),
    )


def _format_maintainer_line(login: str, merges: int, push_cats: dict[str, int]) -> str:
    """Render one maintainer bullet with a profile link and activity counts."""
    total_pushes = sum(push_cats.values())
    if total_pushes <= 0:
        return f"- [@{login}](https://github.com/{login}) ({merges} merges)"

    # List only the categories that actually have pushes, in a fixed order.
    push_parts = [
        f"{push_cats[category]} {category}"
        for category in ("ci", "docs only", "docs", "other")
        if push_cats.get(category, 0) > 0
    ]
    push_str = ", ".join(push_parts)
    return f"- [@{login}](https://github.com/{login}) ({merges} merges, {total_pushes} direct changes: {push_str})"


def _replace_sections(
    content: str, maintainer_section: str, contributor_section: str
) -> str:
    """Return ``content`` with the two generated sections swapped in.

    The body under a ``## Maintainers`` / ``## Contributors`` heading runs
    until the next line starting with ``## `` or ``> `` (or end of file) and
    is replaced wholesale. Everything else is copied through unchanged.
    """
    result: list[str] = []
    skipping = False

    for line in content.split("\n"):
        if line == "## Maintainers":
            result += [line, "", maintainer_section]
            skipping = True
            continue

        if line == "## Contributors":
            # Separate from whatever precedes the heading, but do not stack a
            # second blank line when one is already there; otherwise the
            # file would gain a blank line on every run.
            if result and result[-1] != "":
                result.append("")
            result += [line, "", contributor_section]
            skipping = True
            continue

        # A new heading or blockquote ends the section being replaced.
        if skipping and line.startswith(("## ", "> ")):
            skipping = False
            result.append("")  # blank line before next section

        if not skipping:
            result.append(line)

    updated = "\n".join(result)
    # When a replaced section is the last thing in the file, the original
    # trailing newline is swallowed together with the old body; restore it.
    if content.endswith("\n") and not updated.endswith("\n"):
        updated += "\n"
    return updated


def update_credits(
    maintainers: list[tuple[str, int, dict[str, int]]],
    contributors: list[tuple[str, int]],
) -> None:
    """Rewrite the Maintainers and Contributors sections of the credits file.

    Args:
        maintainers: ``(login, merge_count, push_counts_by_category)`` tuples,
            rendered one bullet per maintainer in the given order.
        contributors: ``(name, commit_count)`` tuples, rendered as one
            comma-separated paragraph in the given order, followed by a UTC
            "last updated" timestamp.

    Reads and writes ``CREDITS_FILE`` as UTF-8 and prints a short summary.
    Sections whose heading is not present in the file are left untouched.
    """
    content = CREDITS_FILE.read_text(encoding="utf-8")

    # Maintainers: GitHub usernames with profile links.
    maintainer_lines = [
        _format_maintainer_line(login, merges, push_cats)
        for login, merges, push_cats in maintainers
    ]
    maintainer_section = (
        "\n".join(maintainer_lines)
        if maintainer_lines
        else "_No maintainers detected._"
    )

    # Contributors: names with commit counts. Names are sanitized so MDX does
    # not interpret characters such as {} as JS.
    contributor_lines = [
        f"{sanitize_name(name)} ({count})" for name, count in contributors
    ]
    contributor_section = (
        ", ".join(contributor_lines)
        if contributor_lines
        else "_No contributors detected._"
    )
    timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC")
    contributor_section = f"{len(contributors)} contributors: {contributor_section}\n\n_Last updated: {timestamp}_"

    content = _replace_sections(content, maintainer_section, contributor_section)
    CREDITS_FILE.write_text(content, encoding="utf-8")
    print(f"Updated {CREDITS_FILE}")
    print(f"  Maintainers: {len(maintainers)}")
    print(f"  Contributors: {len(contributors)}")


def main() -> None:
    """Collect maintainers, then contributors, and write them to the credits file."""
    print("Syncing credits from git/GitHub...")
    # Arguments are evaluated left to right: maintainers are gathered first.
    update_credits(get_maintainers(), get_contributors())


# Run the sync only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()