Fix style issues with pre-commit (#7318)

Co-authored-by: openhands <openhands@all-hands.dev>
Author: Engel Nyst
Date: 2025-03-18 02:34:27 +01:00
Committed by: GitHub
Parent: f1149defc9
Commit: 83458f5146

8 changed files with 55 additions and 50 deletions


@@ -1,8 +1,4 @@
 import math
-import os
-from pathlib import Path
-from tree_sitter import Language, Parser
 def total_byte_entropy_stats(python_code):
@@ -324,8 +320,8 @@ def compute_regression(results):
 def compute_readability(python_code):
     # Create parser and set up language
     import tree_sitter_python
-    from tree_sitter import Parser, Language
+    from tree_sitter import Language, Parser
     parser = Parser(Language(tree_sitter_python.language()))
     results = code_stats(python_code)
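As a side note, a minimal sketch of how a parser constructed this way is used; this is illustrative only and assumes the modern py-tree-sitter bindings that the Parser(Language(...)) call above implies:

    import tree_sitter_python
    from tree_sitter import Language, Parser

    # Build a Python parser the same way compute_readability does above.
    parser = Parser(Language(tree_sitter_python.language()))

    # parse() takes a bytes object and returns a syntax tree.
    tree = parser.parse(b'def f():\n    return 1\n')
    print(tree.root_node.type)  # 'module' is the root node of the Python grammar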


@@ -6,12 +6,11 @@ import numpy as np
 from fuzzywuzzy import fuzz
 from rouge import Rouge
 # increase recursion depth to ensure ROUGE can be calculated for long sentences
 if sys.getrecursionlimit() < 10_000:
     sys.setrecursionlimit(10_000)
 def bleu(gold: List[str], pred: List[str]) -> float:
     """
     Calculate BLEU score, using smoothing method 2 with auto reweighting, in the range of 0~100.
@@ -39,7 +38,7 @@ def batch_bleu(golds: List[List[str]], preds: List[List[str]]) -> List[float]:
     :return: list of BLEU scores
     """
     if len(golds) != len(preds):
-        raise ValueError("golds and preds must have the same length")
+        raise ValueError('golds and preds must have the same length')
     return [bleu(gold, pred) for gold, pred in zip(golds, preds)]
@@ -52,7 +51,7 @@ def corpus_bleu(golds: List[List[str]], preds: List[List[str]]) -> float:
     :return: corpus-level BLEU score
     """
     if len(golds) != len(preds):
-        raise ValueError("golds and preds must have the same length")
+        raise ValueError('golds and preds must have the same length')
     return 100.0 * nltk.translate.bleu_score.corpus_bleu(
         [[gold] for gold in golds],
         preds,
@@ -62,7 +61,7 @@ def corpus_bleu(golds: List[List[str]], preds: List[List[str]]) -> float:
 def edit_sim(
-    gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = " "
+    gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = ' '
 ) -> float:
     """
     Calculate char-level edit similarity, in the range of 0~100.
@@ -84,7 +83,7 @@ def edit_sim(
 def batch_edit_sim(
     golds: List[Union[str, List[str]]],
     preds: List[Union[str, List[str]]],
-    sep: str = " ",
+    sep: str = ' ',
 ) -> List[float]:
     """
     Calculate char-level edit similarity for a batch of sentences.
@@ -95,11 +94,11 @@ def batch_edit_sim(
     :return: list of char-level edit similarity
     """
     if len(golds) != len(preds):
-        raise ValueError("golds and preds must have the same length")
+        raise ValueError('golds and preds must have the same length')
     return [edit_sim(gold, pred, sep) for gold, pred in zip(golds, preds)]
-T = TypeVar("T")
+T = TypeVar('T')
 def exact_match(gold: T, pred: T) -> float:
@@ -124,12 +123,12 @@ def batch_exact_match(golds: List[T], preds: List[T]) -> List[float]:
     :return: list of exact match accuracy
     """
     if len(golds) != len(preds):
-        raise ValueError("golds and preds must have the same length")
+        raise ValueError('golds and preds must have the same length')
     return [exact_match(gold, pred) for gold, pred in zip(golds, preds)]
 def rouge_l(
-    gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = " "
+    gold: Union[str, List[str]], pred: Union[str, List[str]], sep: str = ' '
 ) -> Dict[str, float]:
     """
     Calculate ROUGE-L F1, precision, and recall scores, in the range of 0~100.
@@ -139,7 +138,7 @@ def rouge_l(
     :return: {"p": precision, "r": recall, "f": F1}
     """
     if len(pred) == 0 or len(gold) == 0:
-        return {"p": 0.0, "r": 0.0, "f": 0.0}
+        return {'p': 0.0, 'r': 0.0, 'f': 0.0}
     if isinstance(gold, list):
         gold = sep.join(gold)
     if isinstance(pred, list):
@@ -147,15 +146,15 @@ def rouge_l(
     try:
         rouge = Rouge()
         scores = rouge.get_scores(hyps=pred, refs=gold, avg=True)
-        return {x: scores["rouge-l"][x] * 100.0 for x in ["p", "r", "f"]}
+        return {x: scores['rouge-l'][x] * 100.0 for x in ['p', 'r', 'f']}
     except ValueError:
-        return {"p": 0.0, "r": 0.0, "f": 0.0}
+        return {'p': 0.0, 'r': 0.0, 'f': 0.0}
 def batch_rouge_l(
     golds: List[Union[str, List[str]]],
     preds: List[Union[str, List[str]]],
-    sep: str = " ",
+    sep: str = ' ',
 ) -> Dict[str, List[float]]:
     """
     Calculate ROUGE-L F1, precision, and recall scores for a batch of sentences.
@@ -166,9 +165,9 @@ def batch_rouge_l(
     :return: list of {"p": precision, "r": recall, "f": F1}
     """
     if len(golds) != len(preds):
-        raise ValueError("golds and preds must have the same length")
+        raise ValueError('golds and preds must have the same length')
     scores = [rouge_l(gold, pred, sep) for gold, pred in zip(golds, preds)]
-    return {x: [score[x] for score in scores] for x in ["p", "r", "f"]}
+    return {x: [score[x] for score in scores] for x in ['p', 'r', 'f']}
 def accuracy(
@@ -220,7 +219,7 @@ def batch_accuracy(
     :return: list of accuracy
     """
     if len(golds) != len(preds):
-        raise ValueError("golds and preds must have the same length")
+        raise ValueError('golds and preds must have the same length')
     return [accuracy(gold, pred, ignore) for gold, pred in zip(golds, preds)]
@@ -274,7 +273,7 @@ def self_bleu(samples: List[List[str]]) -> float:
     return np.mean(scores).item()
-def self_edit_distance(samples: List[Union[str, List[str]]], sep=" ") -> float:
+def self_edit_distance(samples: List[Union[str, List[str]]], sep=' ') -> float:
     """
     Calculate self-edit-distance among the samples.
     :param samples: the chosen m samples
@@ -300,12 +299,11 @@ def self_edit_distance(samples: List[Union[str, List[str]]], sep=" ") -> float:
     return np.mean(scores).item()
 QUALITY_METRICS: Dict[str, Callable[[List[str], List[str]], float]] = {
-    "bleu": bleu,
-    "xmatch": exact_match,
-    "edit-sim": edit_sim,
-    "rouge-f": lambda g, p: rouge_l(g, p)["f"],
-    "rouge-p": lambda g, p: rouge_l(g, p)["p"],
-    "rouge-r": lambda g, p: rouge_l(g, p)["r"],
+    'bleu': bleu,
+    'xmatch': exact_match,
+    'edit-sim': edit_sim,
+    'rouge-f': lambda g, p: rouge_l(g, p)['f'],
+    'rouge-p': lambda g, p: rouge_l(g, p)['p'],
+    'rouge-r': lambda g, p: rouge_l(g, p)['r'],
 }
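As a quick, illustrative usage sketch (not part of the commit): the QUALITY_METRICS mapping above takes tokenized gold/prediction pairs, and every metric in this module returns a float in the range 0~100.

    gold = ['return', 'x', '+', '1']
    pred = ['return', 'x', '+', '1']

    # Each entry maps a metric name to a callable over (gold, pred) token lists.
    for name, metric in QUALITY_METRICS.items():
        print(f'{name}: {metric(gold, pred):.1f}')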


@@ -1,30 +1,41 @@
 import re
 from pygments.lexers.python import PythonLexer
 def tokenize_code(code):
     lexer = PythonLexer()
     tokens = process_pygments_tokens(lexer.get_tokens(code))
     return tokens
 def process_pygments_tokens(tokens):
     new_tokens = []
     for token in tokens:
-        if str(token[0]) == "Token.Text" and re.match(r'\s+', token[1]) or str(token[0]) == "Token.Text.Whitespace":
+        if (
+            str(token[0]) == 'Token.Text'
+            and re.match(r'\s+', token[1])
+            or str(token[0]) == 'Token.Text.Whitespace'
+        ):
             continue
         new_tokens.append(token[1])
     new_tokens_final = []
     i = 0
-    while i < len(new_tokens)-2:
-        if new_tokens[i] == '"' and new_tokens[i+1]=='STR' and new_tokens[i+2] == '"':
-            new_tokens_final.append("\"STR\"")
+    while i < len(new_tokens) - 2:
+        if (
+            new_tokens[i] == '"'
+            and new_tokens[i + 1] == 'STR'
+            and new_tokens[i + 2] == '"'
+        ):
+            new_tokens_final.append('"STR"')
             i = i + 3
         else:
             new_tokens_final.append(new_tokens[i])
             i = i + 1
-    for i in range(len(new_tokens)-2, len(new_tokens)):
+    for i in range(len(new_tokens) - 2, len(new_tokens)):
         if i >= 0:
             new_tokens_final.append(new_tokens[i])
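A hypothetical round trip through this tokenizer, for illustration only; it assumes process_pygments_tokens returns new_tokens_final (the truncated hunk does not show the return) and that 'STR' is the placeholder this code expects inside string literals:

    tokens = tokenize_code('x = "STR" + y')
    # Whitespace tokens are dropped and the '"', 'STR', '"' triple is merged,
    # so this should print ['x', '=', '"STR"', '+', 'y'].
    print(tokens)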


@@ -8,7 +8,6 @@ import os
 import pandas as pd
 from tqdm import tqdm
-from evaluation.testgeneval.eval_infer import process_test_suite
 from openhands.events.serialization import event_from_dict
 tqdm.pandas()


@@ -20,7 +20,8 @@ print(
     f'Downloading gold test suites from {args.dataset_name} (split: {args.split}) to {output_filepath}'
 )
 test_suites = [
-    {'instance_id': row['instance_id'], 'test_suite': row['test_src']} for row in dataset
+    {'instance_id': row['instance_id'], 'test_suite': row['test_src']}
+    for row in dataset
 ]
 print(f'{len(test_suites)} test suites loaded')
 pd.DataFrame(test_suites).to_json(output_filepath, lines=True, orient='records')
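For reference, to_json with lines=True and orient='records' writes one JSON object per line (JSONL); a sketch of reading the file back, illustrative only and reusing output_filepath from the script above:

    import pandas as pd

    # One {'instance_id': ..., 'test_suite': ...} record per line.
    df = pd.read_json(output_filepath, lines=True, orient='records')
    print(df.columns.tolist())  # ['instance_id', 'test_suite']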


@@ -90,9 +90,7 @@ if __name__ == '__main__':
             break
     # print the error counter (with percentage)
-    print(
-        f'Average coverage for {num_lines} ({coverage / num_lines * 100:.2f}%)'
-    )
+    print(f'Average coverage for {num_lines} ({coverage / num_lines * 100:.2f}%)')
     print(
         f'Average mutation score for {num_lines} ({mutation_score / num_lines * 100:.2f}%)'
     )


@@ -79,7 +79,7 @@ describe("Actions Service", () => {
     // Mock implementation to capture the message
     let capturedPartialMessage = "";
     (store.dispatch as any).mockImplementation((action: any) => {
-      if (action.type === "chat/addAssistantMessage" && 
+      if (action.type === "chat/addAssistantMessage" &&
          action.payload.includes("believe that the task was **completed partially**")) {
        capturedPartialMessage = action.payload;
      }
@@ -87,7 +87,7 @@ describe("Actions Service", () => {
     handleActionMessage(messagePartial);
     expect(capturedPartialMessage).toContain("I believe that the task was **completed partially**");
-    // Test not completed 
+    // Test not completed
     const messageNotCompleted: ActionMessage = {
       id: 2,
@@ -106,7 +106,7 @@ describe("Actions Service", () => {
     // Mock implementation to capture the message
     let capturedNotCompletedMessage = "";
     (store.dispatch as any).mockImplementation((action: any) => {
-      if (action.type === "chat/addAssistantMessage" && 
+      if (action.type === "chat/addAssistantMessage" &&
          action.payload.includes("believe that the task was **not completed**")) {
        capturedNotCompletedMessage = action.payload;
      }
@@ -114,7 +114,7 @@ describe("Actions Service", () => {
     handleActionMessage(messageNotCompleted);
     expect(capturedNotCompletedMessage).toContain("I believe that the task was **not completed**");
-
+
     // Test completed successfully
     const messageCompleted: ActionMessage = {
       id: 3,
@@ -133,7 +133,7 @@ describe("Actions Service", () => {
     // Mock implementation to capture the message
     let capturedCompletedMessage = "";
     (store.dispatch as any).mockImplementation((action: any) => {
-      if (action.type === "chat/addAssistantMessage" && 
+      if (action.type === "chat/addAssistantMessage" &&
          action.payload.includes("believe that the task was **completed successfully**")) {
        capturedCompletedMessage = action.payload;
      }


@@ -65,7 +65,9 @@ async def get_github_user(
     access_token: SecretStr | None = Depends(get_access_token),
 ):
     if provider_tokens:
-        client = ProviderHandler(provider_tokens=provider_tokens, external_auth_token=access_token)
+        client = ProviderHandler(
+            provider_tokens=provider_tokens, external_auth_token=access_token
+        )
         try:
             user: User = await client.get_user()
@@ -164,7 +166,7 @@ async def search_github_repositories(
 @app.get('/suggested-tasks', response_model=list[SuggestedTask])
 async def get_suggested_tasks(
     provider_tokens: PROVIDER_TOKEN_TYPE | None = Depends(get_provider_tokens),
-    access_token: SecretStr | None = Depends(get_access_token)
+    access_token: SecretStr | None = Depends(get_access_token),
 ):
     """Get suggested tasks for the authenticated user across their most recently pushed repositories.