Replace bash scripts with Python for git operations (#9914)

Co-authored-by: openhands <openhands@all-hands.dev>
This commit is contained in:
Tim O'Farrell
2025-07-29 07:34:52 -06:00
committed by GitHub
parent 8fb3728391
commit d9a595c9b1
6 changed files with 611 additions and 508 deletions

View File

@@ -0,0 +1,195 @@
#!/usr/bin/env python3
"""
Get git changes in the current working directory relative to the remote origin if possible.
NOTE: Since this is run as a script, there should be no imports from project files!
"""
import glob
import json
import os
import subprocess
from pathlib import Path
def run(cmd: str, cwd: str) -> str:
result = subprocess.run(
args=cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd
)
byte_content = result.stderr or result.stdout or b''
if result.returncode != 0:
raise RuntimeError(
f'error_running_cmd:{result.returncode}:{byte_content.decode()}'
)
return byte_content.decode().strip()
def get_valid_ref(repo_dir: str) -> str | None:
try:
current_branch = run('git --no-pager rev-parse --abbrev-ref HEAD', repo_dir)
except RuntimeError:
# Not a git repository (Or no commits)
return None
try:
default_branch = (
run('git --no-pager remote show origin | grep "HEAD branch"', repo_dir)
.split()[-1]
.strip()
)
except RuntimeError:
# Git repository does not have a remote origin - use current
return current_branch
ref_current_branch = f'origin/{current_branch}'
ref_non_default_branch = f'$(git --no-pager merge-base HEAD "$(git --no-pager rev-parse --abbrev-ref origin/{default_branch})")'
ref_default_branch = f'origin/{default_branch}'
ref_new_repo = '$(git --no-pager rev-parse --verify 4b825dc642cb6eb9a060e54bf8d69288fbee4904)' # compares with empty tree
refs = [
ref_current_branch,
ref_non_default_branch,
ref_default_branch,
ref_new_repo,
]
# Find a ref that exists...
for ref in refs:
try:
result = run(f'git --no-pager rev-parse --verify {ref}', repo_dir)
return result
except RuntimeError:
# invalid ref - try next
continue
return None
def get_changes_in_repo(repo_dir: str) -> list[dict[str, str]]:
# Gets the status relative to the origin default branch - not the same as `git status`
ref = get_valid_ref(repo_dir)
if not ref:
return []
# Get changed files
changed_files = run(
f'git --no-pager diff --name-status {ref}', repo_dir
).splitlines()
changes = []
for line in changed_files:
if not line.strip():
raise RuntimeError(f'unexpected_value_in_git_diff:{changed_files}')
# Handle different output formats from git diff --name-status
# Depending on git config, format can be either:
# * "A file.txt"
# * "A file.txt"
# * "R100 old_file.txt new_file.txt" (rename with similarity percentage)
parts = line.split()
if len(parts) < 2:
raise RuntimeError(f'unexpected_value_in_git_diff:{changed_files}')
status = parts[0].strip()
# Handle rename operations (status starts with 'R' followed by similarity percentage)
if status.startswith('R') and len(parts) == 3:
# Rename: convert to delete (old path) + add (new path)
old_path = parts[1].strip()
new_path = parts[2].strip()
changes.append(
{
'status': 'D',
'path': old_path,
}
)
changes.append(
{
'status': 'A',
'path': new_path,
}
)
continue
# Handle copy operations (status starts with 'C' followed by similarity percentage)
elif status.startswith('C') and len(parts) == 3:
# Copy: only add the new path (original remains)
new_path = parts[2].strip()
changes.append(
{
'status': 'A',
'path': new_path,
}
)
continue
# Handle regular operations (M, A, D, etc.)
elif len(parts) == 2:
path = parts[1].strip()
else:
raise RuntimeError(f'unexpected_value_in_git_diff:{changed_files}')
if status == '??':
status = 'A'
elif status == '*':
status = 'M'
# Check for valid single-character status codes
if status in {'M', 'A', 'D', 'U'}:
changes.append(
{
'status': status,
'path': path,
}
)
else:
raise RuntimeError(f'unexpected_status_in_git_diff:{changed_files}')
# Get untracked files
untracked_files = run(
'git --no-pager ls-files --others --exclude-standard', repo_dir
).splitlines()
for path in untracked_files:
if path:
changes.append({'status': 'A', 'path': path})
return changes
def get_git_changes(cwd: str) -> list[dict[str, str]]:
git_dirs = {
os.path.dirname(f)[2:]
for f in glob.glob('./*/.git', root_dir=cwd, recursive=True)
}
# First try the workspace directory
changes = get_changes_in_repo(cwd)
# Filter out any changes which are in one of the git directories
changes = [
change
for change in changes
if next(
iter(git_dir for git_dir in git_dirs if change['path'].startswith(git_dir)),
None,
)
is None
]
# Add changes from git directories
for git_dir in git_dirs:
git_dir_changes = get_changes_in_repo(str(Path(cwd, git_dir)))
for change in git_dir_changes:
change['path'] = git_dir + '/' + change['path']
changes.append(change)
changes.sort(key=lambda change: change['path'])
return changes
if __name__ == '__main__':
try:
changes = get_git_changes(os.getcwd())
print(json.dumps(changes))
except Exception as e:
print(json.dumps({'error': str(e)}))

View File

@@ -0,0 +1,103 @@
#!/usr/bin/env python3
"""
Get git diff in a single git file for the closest git repo in the file system
NOTE: Since this is run as a script, there should be no imports from project files!
"""
import json
import os
import subprocess
import sys
from pathlib import Path
def get_closest_git_repo(path: Path) -> Path | None:
while True:
path = path.parent
git_path = Path(path, '.git')
if git_path.is_dir():
return path
if path.parent == path:
return None
def run(cmd: str, cwd: str) -> str:
result = subprocess.run(
args=cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, cwd=cwd
)
byte_content = result.stderr or result.stdout or b''
if result.returncode != 0:
raise RuntimeError(
f'error_running_cmd:{result.returncode}:{byte_content.decode()}'
)
return byte_content.decode().strip()
def get_valid_ref(repo_dir: str) -> str | None:
try:
current_branch = run('git --no-pager rev-parse --abbrev-ref HEAD', repo_dir)
except RuntimeError:
# Not a git repository (Or no commits)
return None
try:
default_branch = (
run('git --no-pager remote show origin | grep "HEAD branch"', repo_dir)
.split()[-1]
.strip()
)
except RuntimeError:
# Git repository does not have a remote origin - use current
return current_branch
ref_current_branch = f'origin/{current_branch}'
ref_non_default_branch = f'$(git --no-pager merge-base HEAD "$(git --no-pager rev-parse --abbrev-ref origin/{default_branch})")'
ref_default_branch = 'origin/' + default_branch
ref_new_repo = '$(git --no-pager rev-parse --verify 4b825dc642cb6eb9a060e54bf8d69288fbee4904)' # compares with empty tree
refs = [
ref_current_branch,
ref_non_default_branch,
ref_default_branch,
ref_new_repo,
]
# Find a ref that exists...
for ref in refs:
try:
run(f'git --no-pager rev-parse --verify {ref}', repo_dir)
return ref
except RuntimeError:
# invalid ref - try next
continue
return None
def get_git_diff(relative_file_path: str) -> dict[str, str]:
path = Path(os.getcwd(), relative_file_path).resolve()
closest_git_repo = get_closest_git_repo(path)
if not closest_git_repo:
raise ValueError('no_repo')
current_rev = get_valid_ref(str(closest_git_repo))
try:
original = run(
f'git show "{current_rev}:{path.relative_to(closest_git_repo)}"',
str(closest_git_repo),
)
except RuntimeError:
original = ''
try:
with open(path, 'r') as f:
modified = '\n'.join(f.read().splitlines())
except FileNotFoundError:
modified = ''
return {
'modified': modified,
'original': original,
}
if __name__ == '__main__':
diff = get_git_diff(sys.argv[-1])
print(json.dumps(diff))

View File

@@ -1,6 +1,15 @@
import json
from dataclasses import dataclass
from pathlib import Path
from typing import Callable
from uuid import uuid4
from openhands.core.logger import openhands_logger as logger
from openhands.runtime.utils import git_changes, git_diff
GIT_CHANGES_CMD = 'python3 /openhands/code/openhands/runtime/utils/git_changes.py'
GIT_DIFF_CMD = (
'python3 /openhands/code/openhands/runtime/utils/git_diff.py "{file_path}"'
)
@dataclass
@@ -25,9 +34,13 @@ class GitHandler:
def __init__(
self,
execute_shell_fn: Callable[[str, str | None], CommandResult],
create_file_fn: Callable[[str, str], int],
):
self.execute = execute_shell_fn
self.create_file_fn = create_file_fn
self.cwd: str | None = None
self.git_changes_cmd = GIT_CHANGES_CMD
self.git_diff_cmd = GIT_DIFF_CMD
def set_cwd(self, cwd: str) -> None:
"""
@@ -38,148 +51,13 @@ class GitHandler:
"""
self.cwd = cwd
def _is_git_repo(self) -> bool:
"""
Checks if the current directory is a Git repository.
Returns:
bool: True if inside a Git repository, otherwise False.
"""
cmd = 'git --no-pager rev-parse --is-inside-work-tree'
output = self.execute(cmd, self.cwd)
return output.content.strip() == 'true'
def _get_current_file_content(self, file_path: str) -> str:
"""
Retrieves the current content of a given file.
Args:
file_path (str): Path to the file.
Returns:
str: The file content.
"""
output = self.execute(f'cat {file_path}', self.cwd)
return output.content
def _verify_ref_exists(self, ref: str) -> bool:
"""
Verifies whether a specific Git reference exists.
Args:
ref (str): The Git reference to check.
Returns:
bool: True if the reference exists, otherwise False.
"""
cmd = f'git --no-pager rev-parse --verify {ref}'
output = self.execute(cmd, self.cwd)
return output.exit_code == 0
def _get_ref_content(self, file_path: str) -> str:
"""
Retrieves the content of a file from a valid Git reference.
Finds the git repository closest to the file in the tree and executes the command in that context.
Args:
file_path (str): The file path in the repository.
Returns:
str: The content of the file from the reference, or an empty string if unavailable.
"""
if not self.cwd:
return ''
unique_id = uuid4().hex
# Single bash command that finds the closest git repository to the file and gets the ref content
cmd = f"""bash -c '
# Convert to absolute path
file_path="$(realpath "{file_path}")"
# Find the closest git repository by walking up the directory tree
current_dir="$(dirname "$file_path")"
git_repo_dir=""
while [[ "$current_dir" != "/" ]]; do
if [[ -d "$current_dir/.git" ]] || git -C "$current_dir" rev-parse --git-dir >/dev/null 2>&1; then
git_repo_dir="$current_dir"
break
fi
current_dir="$(dirname "$current_dir")"
done
# If no git repository found, exit
if [[ -z "$git_repo_dir" ]]; then
exit 1
fi
# Get the file path relative to the git repository root
repo_root="$(cd "$git_repo_dir" && git rev-parse --show-toplevel)"
relative_file_path="${{file_path#${{repo_root}}/}}"
# Function to get current branch
get_current_branch() {{
git -C "$git_repo_dir" rev-parse --abbrev-ref HEAD 2>/dev/null
}}
# Function to get default branch
get_default_branch() {{
git -C "$git_repo_dir" remote show origin 2>/dev/null | grep "HEAD branch" | awk "{{print \\$NF}}" || echo "main"
}}
# Function to verify if a ref exists
verify_ref_exists() {{
git -C "$git_repo_dir" rev-parse --verify "$1" >/dev/null 2>&1
}}
# Get valid reference for comparison
current_branch="$(get_current_branch)"
default_branch="$(get_default_branch)"
# Check if origin remote exists
has_origin="$(git -C "$git_repo_dir" remote | grep -q "^origin$" && echo "true" || echo "false")"
if [[ "$has_origin" == "true" ]]; then
ref_current_branch="origin/$current_branch"
ref_non_default_branch="$(git -C "$git_repo_dir" merge-base HEAD "$(git -C "$git_repo_dir" rev-parse --abbrev-ref origin/$default_branch)" 2>/dev/null || echo "")"
ref_default_branch="origin/$default_branch"
else
# For repositories without origin, try HEAD~1 (previous commit) or empty tree
ref_current_branch="HEAD~1"
ref_non_default_branch=""
ref_default_branch=""
fi
ref_new_repo="$(git -C "$git_repo_dir" rev-parse --verify 4b825dc642cb6eb9a060e54bf8d69288fbee4904 2>/dev/null || echo "")" # empty tree
# Try refs in order of preference
valid_ref=""
for ref in "$ref_current_branch" "$ref_non_default_branch" "$ref_default_branch" "$ref_new_repo"; do
if [[ -n "$ref" ]] && verify_ref_exists "$ref"; then
valid_ref="$ref"
break
fi
done
# If no valid ref found, exit
if [[ -z "$valid_ref" ]]; then
exit 1
fi
# Get the file content from the reference
git -C "$git_repo_dir" show "$valid_ref:$relative_file_path" 2>/dev/null || exit 1
# {unique_id}'"""
result = self.execute(cmd, self.cwd)
if result.exit_code != 0:
return ''
# TODO: The command echoes the bash script. Why?
content = result.content.split(f'{unique_id}')[-1]
return content
def _create_python_script_file(self, file: str):
result = self.execute('mktemp -d', self.cwd)
script_file = Path(result.content.strip(), Path(file).name)
with open(file, 'r') as f:
self.create_file_fn(str(script_file), f.read())
result = self.execute(f'chmod +x "{script_file}"', self.cwd)
return script_file
def get_git_changes(self) -> list[dict[str, str]] | None:
"""
@@ -195,57 +73,31 @@ class GitHandler:
if not self.cwd:
return None
# Single bash command that:
# 1. Creates a list of directories to check (current dir + direct subdirectories)
# 2. For each directory, checks if it's a git repo and gets status
# 3. Outputs in format: REPO_PATH|STATUS|FILE_PATH
cmd = """bash -c '
{
# Check current directory first
echo "."
# List direct subdirectories (excluding hidden ones)
find . -maxdepth 1 -type d ! -name ".*" ! -name "." 2>/dev/null || true
} | while IFS= read -r dir; do
if [ -d "$dir/.git" ] || git -C "$dir" rev-parse --git-dir >/dev/null 2>&1; then
# Get absolute path of the directory
# Get git status for this repository
git -C "$dir" status --porcelain -uall 2>/dev/null | while IFS= read -r line; do
if [ -n "$line" ]; then
# Extract status (first 2 chars) and file path (from char 3 onwards)
status=$(echo "$line" | cut -c1-2)
file_path=$(echo "$line" | cut -c4-)
# Convert status codes to single character
case "$status" in
"M "*|" M") echo "$dir|M|$file_path" ;;
"A "*|" A") echo "$dir|A|$file_path" ;;
"D "*|" D") echo "$dir|D|$file_path" ;;
"R "*|" R") echo "$dir|R|$file_path" ;;
"C "*|" C") echo "$dir|C|$file_path" ;;
"U "*|" U") echo "$dir|U|$file_path" ;;
"??") echo "$dir|A|$file_path" ;;
*) echo "$dir|M|$file_path" ;;
esac
fi
done
fi
done
' """
result = self.execute(self.git_changes_cmd, self.cwd)
if result.exit_code == 0:
try:
changes = json.loads(result.content)
return changes
except Exception:
logger.exception(
'GitHandler:get_git_changes:error',
extra={'content': result.content},
)
return None
result = self.execute(cmd.strip(), self.cwd)
if result.exit_code != 0 or not result.content.strip():
if self.git_changes_cmd != GIT_CHANGES_CMD:
# We have already tried to add a script to the workspace - it did not work
return None
# Parse the output
changes = []
for line in result.content.strip().split('\n'):
if '|' in line:
parts = line.split('|', 2)
if len(parts) == 3:
repo_path, status, file_path = parts
file_path = f'{repo_path}/{file_path}'[2:]
changes.append({'status': status, 'path': file_path})
# We try to add a script for getting git changes to the runtime - legacy runtimes may be missing the script
logger.info(
'GitHandler:get_git_changes: adding git_changes script to runtime...'
)
script_file = self._create_python_script_file(git_changes.__file__)
self.git_changes_cmd = f'python3 {script_file}'
return changes if changes else None
# Try again with the new changes cmd
return self.get_git_changes()
def get_git_diff(self, file_path: str) -> dict[str, str]:
"""
@@ -257,36 +109,23 @@ class GitHandler:
Returns:
dict[str, str]: A dictionary containing the original and modified content.
"""
modified = self._get_current_file_content(file_path)
original = self._get_ref_content(file_path)
# If cwd is not set, return None
if not self.cwd:
raise ValueError('no_dir_in_git_diff')
return {
'modified': modified,
'original': original,
}
result = self.execute(self.git_diff_cmd.format(file_path=file_path), self.cwd)
if result.exit_code == 0:
diff = json.loads(result.content)
return diff
if self.git_diff_cmd != GIT_DIFF_CMD:
# We have already tried to add a script to the workspace - it did not work
raise ValueError('error_in_git_diff')
def parse_git_changes(changes_list: list[str]) -> list[dict[str, str]]:
"""
Parses the list of changed files and extracts their statuses and paths.
# We try to add a script for getting git changes to the runtime - legacy runtimes may be missing the script
logger.info('GitHandler:get_git_diff: adding git_diff script to runtime...')
script_file = self._create_python_script_file(git_diff.__file__)
self.git_diff_cmd = f'python3 {script_file} "{{file_path}}"'
Args:
changes_list (list[str]): List of changed file entries.
Returns:
list[dict[str, str]]: Parsed list of file changes with statuses.
"""
result = []
for line in changes_list:
status = line[:2].strip()
path = line[2:].strip()
# Get the first non-space character as the primary status
primary_status = status.replace(' ', '')[0]
result.append(
{
'status': primary_status,
'path': path,
}
)
return result
# Try again with the new changes cmd
return self.get_git_diff(file_path)