migrate to use OH version

Merge branch 'main' into eval/visualcodebench
feat: adapt Design2Code block detection for in-memory evaluation
2026-04-29 03:00:45 -04:00 · 2025-01-26 15:24:35 -05:00 · 2025-01-26 15:14:28 -05:00 · 2024-11-30 19:28:22 +00:00 · 2024-11-30 14:00:25 -05:00 · 2024-11-30 17:17:05 +00:00
7 changed files with 1246 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -176,6 +176,7 @@ evaluation/gorilla/data
 evaluation/toolqa/data
 evaluation/scienceagentbench/benchmark
 evaluation/commit0_bench/repos
 evaluation/visualcodebench/
 # openhands resolver
 output/
--- a/evaluation/benchmarks/visualcodebench/eval.py
+++ b/evaluation/benchmarks/visualcodebench/eval.py
@@ -0,0 +1,674 @@
 from collections import Counter
 from copy import deepcopy
 from difflib import SequenceMatcher
 from io import BytesIO
 from bs4 import BeautifulSoup, Comment, NavigableString, Tag
 import cv2
 import numpy as np
 import torch
 from colormath.color_conversions import convert_color
 from colormath.color_diff import delta_e_cie2000
 from colormath.color_objects import LabColor, sRGBColor
 from PIL import Image, ImageChops, ImageColor
 from scipy.optimize import linear_sum_assignment
 from transformers import CLIPModel, CLIPProcessor
 from openhands.core.logger import openhands_logger as logger
 def calculate_similarity(block1, block2):
    """Calculate text similarity between two blocks using SequenceMatcher."""
    text_similarity = SequenceMatcher(None, block1['text'], block2['text']).ratio()
    return text_similarity
 def adjust_cost_for_context(cost_matrix, consecutive_bonus=1.0, window_size=20):
    """Adjust cost matrix by considering context similarity."""
    if window_size <= 0:
        return cost_matrix
    n, m = cost_matrix.shape
    adjusted_cost_matrix = np.copy(cost_matrix)
    for i in range(n):
        for j in range(m):
            if adjusted_cost_matrix[i][j] >= -0.5:
                continue
            nearby_matrix = cost_matrix[
                max(0, i - window_size) : min(n, i + window_size + 1),
                max(0, j - window_size) : min(m, j + window_size + 1),
            ]
            flattened_array = nearby_matrix.flatten()
            sorted_array = np.sort(flattened_array)[::-1]
            sorted_array = np.delete(
                sorted_array, np.where(sorted_array == cost_matrix[i, j])[0][0]
            )
            top_k_elements = sorted_array[-window_size * 2 :]
            bonus = consecutive_bonus * np.sum(top_k_elements)
            adjusted_cost_matrix[i][j] += bonus
    return adjusted_cost_matrix
 def create_cost_matrix(A, B):
    """Create cost matrix for block matching."""
    n = len(A)
    m = len(B)
    cost_matrix = np.zeros((n, m))
    for i in range(n):
        for j in range(m):
            cost_matrix[i, j] = -calculate_similarity(A[i], B[j])
    return cost_matrix
 def calculate_distance_max_1d(x1, y1, x2, y2):
    """Calculate maximum 1D distance between points."""
    return max(abs(x2 - x1), abs(y2 - y1))
 def calculate_ratio(h1, h2):
    """Calculate ratio between two heights."""
    return max(h1, h2) / min(h1, h2)
 def rgb_to_lab(rgb):
    """Convert RGB color to Lab color space."""
    rgb_color = sRGBColor(rgb[0], rgb[1], rgb[2], is_upscaled=True)
    lab_color = convert_color(rgb_color, LabColor)
    return lab_color
 def color_similarity_ciede2000(rgb1, rgb2):
    """Calculate color similarity using CIEDE2000 formula."""
    lab1 = rgb_to_lab(rgb1)
    lab2 = rgb_to_lab(rgb2)
    delta_e = delta_e_cie2000(lab1, lab2)
    similarity = max(0, 1 - (delta_e / 100))
    return similarity
 def merge_blocks_wo_check(block1, block2):
    """Merge two blocks without additional checks."""
    merged_text = block1['text'] + ' ' + block2['text']
    x_min = min(block1['bbox'][0], block2['bbox'][0])
    y_min = min(block1['bbox'][1], block2['bbox'][1])
    x_max = max(
        block1['bbox'][0] + block1['bbox'][2], block2['bbox'][0] + block2['bbox'][2]
    )
    y_max = max(
        block1['bbox'][1] + block1['bbox'][3], block2['bbox'][1] + block2['bbox'][3]
    )
    merged_bbox = (x_min, y_min, x_max - x_min, y_max - y_min)
    merged_color = tuple(
        (color1 + color2) // 2
        for color1, color2 in zip(block1['color'], block2['color'])
    )
    return {'text': merged_text, 'bbox': merged_bbox, 'color': merged_color}
 def find_maximum_matching(A, B, consecutive_bonus, window_size):
    """Find maximum matching between two sets of blocks."""
    cost_matrix = create_cost_matrix(A, B)
    cost_matrix = adjust_cost_for_context(cost_matrix, consecutive_bonus, window_size)
    row_ind, col_ind = linear_sum_assignment(cost_matrix)
    current_cost = cost_matrix[row_ind, col_ind].tolist()
    return list(zip(row_ind, col_ind)), current_cost, cost_matrix
 def remove_indices(lst, indices):
    """Remove indices from list in reverse order."""
    for index in sorted(indices, reverse=True):
        if index < len(lst):
            lst.pop(index)
    return lst
 def merge_blocks_by_list(blocks, merge_list):
    """Merge blocks according to merge list."""
    pop_list = []
    while merge_list:
        i = merge_list[0][0]
        j = merge_list[0][1]
        blocks[i] = merge_blocks_wo_check(blocks[i], blocks[j])
        pop_list.append(j)
        merge_list.pop(0)
        if merge_list:
            new_merge_list = []
            for k in range(len(merge_list)):
                if (
                    merge_list[k][0] != i
                    and merge_list[k][1] != i
                    and merge_list[k][0] != j
                    and merge_list[k][1] != j
                ):
                    new_merge_list.append(merge_list[k])
            merge_list = new_merge_list
    remove_indices(blocks, pop_list)
    return blocks
 def difference_of_means(list1, list2):
    """Calculate difference of means between two lists."""
    counter1 = Counter(list1)
    counter2 = Counter(list2)
    for element in set(list1) & set(list2):
        common_count = min(counter1[element], counter2[element])
        counter1[element] -= common_count
        counter2[element] -= common_count
    unique_list1 = [item for item in counter1.elements()]
    unique_list2 = [item for item in counter2.elements()]
    mean_list1 = sum(unique_list1) / len(unique_list1) if unique_list1 else 0
    mean_list2 = sum(unique_list2) / len(unique_list2) if unique_list2 else 0
    if mean_list1 - mean_list2 > 0:
        if min(unique_list1) > min(unique_list2):
            return mean_list1 - mean_list2
        return 0.0
    return mean_list1 - mean_list2
 def find_possible_merge(A, B, consecutive_bonus, window_size, debug=False):
    """Find possible merges between blocks."""
    merge_bonus = 0.0
    merge_windows = 1
    def sortFn(value):
        return value[2]
    while True:
        A_changed = False
        B_changed = False
        matching, current_cost, cost_matrix = find_maximum_matching(
            A, B, merge_bonus, merge_windows
        )
        if len(A) >= 2:
            merge_list = []
            for i in range(len(A) - 1):
                new_A = deepcopy(A)
                new_A[i] = merge_blocks_wo_check(new_A[i], new_A[i + 1])
                new_A.pop(i + 1)
                updated_matching, updated_cost, _ = find_maximum_matching(
                    new_A, B, merge_bonus, merge_windows
                )
                diff = difference_of_means(current_cost, updated_cost)
                if diff > 0.05:
                    merge_list.append([i, i + 1, diff])
            merge_list.sort(key=sortFn, reverse=True)
            if merge_list:
                A_changed = True
                A = merge_blocks_by_list(A, merge_list)
                matching, current_cost, cost_matrix = find_maximum_matching(
                    A, B, merge_bonus, merge_windows
                )
        if len(B) >= 2:
            merge_list = []
            for i in range(len(B) - 1):
                new_B = deepcopy(B)
                new_B[i] = merge_blocks_wo_check(new_B[i], new_B[i + 1])
                new_B.pop(i + 1)
                updated_matching, updated_cost, _ = find_maximum_matching(
                    A, new_B, merge_bonus, merge_windows
                )
                diff = difference_of_means(current_cost, updated_cost)
                if diff > 0.05:
                    merge_list.append([i, i + 1, diff])
            merge_list.sort(key=sortFn, reverse=True)
            if merge_list:
                B_changed = True
                B = merge_blocks_by_list(B, merge_list)
                matching, current_cost, cost_matrix = find_maximum_matching(
                    A, B, merge_bonus, merge_windows
                )
        if not A_changed and not B_changed:
            break
    matching, _, _ = find_maximum_matching(A, B, consecutive_bonus, window_size)
    return A, B, matching
 def merge_blocks_by_bbox(blocks):
    """Merge blocks with same bounding box."""
    merged_blocks = {}
    for block in blocks:
        bbox = tuple(block['bbox'])
        if bbox in merged_blocks:
            existing_block = merged_blocks[bbox]
            existing_block['text'] += ' ' + block['text']
            existing_block['color'] = [
                (ec + c) / 2 for ec, c in zip(existing_block['color'], block['color'])
            ]
        else:
            merged_blocks[bbox] = block
    return list(merged_blocks.values())
 def mask_bounding_boxes_with_inpainting(image, bounding_boxes):
    """Mask bounding boxes in image using inpainting."""
    image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    mask = np.zeros(image_cv.shape[:2], dtype=np.uint8)
    height, width = image_cv.shape[:2]
    for bbox in bounding_boxes:
        x_ratio, y_ratio, w_ratio, h_ratio = bbox
        x = int(x_ratio * width)
        y = int(y_ratio * height)
        w = int(w_ratio * width)
        h = int(h_ratio * height)
        mask[y : y + h, x : x + w] = 255
    inpainted_image = cv2.inpaint(image_cv, mask, 3, cv2.INPAINT_TELEA)
    return Image.fromarray(cv2.cvtColor(inpainted_image, cv2.COLOR_BGR2RGB))
 def rescale_and_mask(image, blocks):
    """Rescale image and mask blocks."""
    if blocks:
        image = mask_bounding_boxes_with_inpainting(image, blocks)
    width, height = image.size
    if width < height:
        new_size = (width, width)
    else:
        new_size = (height, height)
    return image.resize(new_size, Image.LANCZOS)
 def calculate_clip_similarity(image1, image2, blocks1, blocks2):
    """Calculate CLIP similarity between two images."""
    model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
    processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = model.to(device)
    # Mask and preprocess images
    image1_masked = rescale_and_mask(image1, [block['bbox'] for block in blocks1])
    image2_masked = rescale_and_mask(image2, [block['bbox'] for block in blocks2])
    inputs = processor(
        images=[image1_masked, image2_masked], return_tensors='pt', padding=True
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}
    # Calculate features and similarity
    with torch.no_grad():
        image_features = model.get_image_features(**inputs)
        image_features1 = image_features[0].unsqueeze(0)
        image_features2 = image_features[1].unsqueeze(0)
        image_features1 /= image_features1.norm(dim=-1, keepdim=True)
        image_features2 /= image_features2.norm(dim=-1, keepdim=True)
        similarity = (image_features1 @ image_features2.T).item()
    return similarity
 def rgb_to_hex(rgb):
    """Convert an RGB tuple to hexadecimal format."""
    return '{:02X}{:02X}{:02X}'.format(*rgb)
 class ColorPool:
    def __init__(self, offset=0):
        color_values = list(range(10, 251, 16))
        color_list = [((r + offset) % 256, (g + offset) % 256, (b + offset) % 256) 
                     for r in color_values for g in color_values for b in color_values]
        self.color_pool = [rgb_to_hex(color) for color in color_list]
    def pop_color(self):
        if self.color_pool:
            return self.color_pool.pop()
        else:
            raise NotImplementedError
 def process_html_str(html_str, offset=0):
    """Process HTML string to assign unique colors to text elements."""
    soup = BeautifulSoup(html_str, 'html.parser')
    def update_style(element, property_name, value):
        important_value = f"{value} !important"
        styles = element.attrs.get('style', '').split(';')
        updated_styles = [s for s in styles if not s.strip().startswith(property_name) and len(s.strip()) > 0]
        updated_styles.append(f"{property_name}: {important_value}")
        element['style'] = '; '.join(updated_styles).strip()
    # Set background color of all elements to transparent white
    for element in soup.find_all(True):
        update_style(element, 'background-color', 'rgba(255, 255, 255, 0.0)')
    color_pool = ColorPool(offset)
    text_tags = ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'div', 'span', 'a', 'b', 'li', 
                 'table', 'td', 'th', 'button', 'footer', 'header', 'figcaption']
    for tag in soup.find_all(text_tags):
        color = f"#{color_pool.pop_color()}"
        update_style(tag, 'color', color)
        update_style(tag, 'opacity', '1.0')
    return str(soup)
 def similar(n1, n2):
    """Check if two numbers are similar within a threshold."""
    return abs(n1 - n2) <= 8
 def find_different_pixels(image1, image2):
    """Find pixels that differ between two images."""
    if image1.size != image2.size:
        logger.warning("Images are not the same size")
        return None
    image1 = image1.convert('RGB')
    image2 = image2.convert('RGB')
    pixels1 = image1.load()
    pixels2 = image2.load()
    different_pixels = []
    for x in range(image1.size[0]):
        for y in range(image1.size[1]):
            r1, g1, b1 = pixels1[x, y]
            r2, g2, b2 = pixels2[x, y]
            if similar((r1 + 50) % 256, r2) and similar((g1 + 50) % 256, g2) and similar((b1 + 50) % 256, b2):
                different_pixels.append((y, x))
    return np.stack(different_pixels) if different_pixels else None
 def extract_text_with_color(html_str):
    """Extract text and color information from HTML string."""
    def get_color(tag):
        if 'style' in tag.attrs:
            styles = tag['style'].split(';')
            color_style = [s for s in styles if 'color' in s and 'background-color' not in s]
            if color_style:
                color = color_style[-1].split(':')[1].strip().replace(" !important", "")
                if color[0] == "#":
                    return color
                else:
                    try:
                        if color.startswith('rgb'):
                            color = tuple(map(int, color[4:-1].split(',')))
                        else:
                            color = ImageColor.getrgb(color)
                        return '#{:02x}{:02x}{:02x}'.format(*color)
                    except ValueError:
                        logger.warning(f"Unable to identify or convert color: {color}")
                        return None
        return None
    def extract_text_recursive(element, parent_color='#000000'):
        if isinstance(element, Comment):
            return None
        elif isinstance(element, NavigableString):
            text = element.strip()
            return (text, parent_color) if text else None
        elif isinstance(element, Tag):
            current_color = get_color(element) or parent_color
            children_texts = filter(None, [extract_text_recursive(child, current_color) 
                                        for child in element.children])
            return list(children_texts)
    soup = BeautifulSoup(html_str, 'html.parser')
    body = soup.body
    return extract_text_recursive(body) if body else []
 def flatten_tree(tree):
    """Flatten a nested tree structure into a list."""
    flat_list = []
    def flatten(node):
        if isinstance(node, list):
            for item in node:
                flatten(item)
        else:
            flat_list.append(node)
    flatten(tree)
    return flat_list
 def get_blocks_from_image_diff_pixels(image, html_text_color_tree, different_pixels):
    """Extract text blocks from image using color differences."""
    image_cv = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    x_w = image_cv.shape[0]
    y_w = image_cv.shape[1]
    def hex_to_bgr(hex_color):
        hex_color = hex_color.lstrip('#')
        rgb = tuple(int(hex_color[i:i+2], 16) for i in (0, 2, 4))
        return rgb[::-1]
    def get_intersect(arr1, arr2):
        arr1_reshaped = arr1.view([('', arr1.dtype)] * arr1.shape[1])
        arr2_reshaped = arr2.view([('', arr2.dtype)] * arr2.shape[1])
        common_rows = np.intersect1d(arr1_reshaped, arr2_reshaped)
        return common_rows.view(arr1.dtype).reshape(-1, arr1.shape[1])
    blocks = []
    for item in html_text_color_tree:
        try:
            color = np.array(hex_to_bgr(item[1]), dtype="uint8")
        except:
            continue
        lower = color - 4
        upper = color + 4
        mask = cv2.inRange(image_cv, lower, upper)
        coords = np.column_stack(np.where(mask > 0))
        coords = get_intersect(coords, different_pixels)
        if coords.size == 0:
            continue
        x_min, y_min = np.min(coords, axis=0)
        x_max, y_max = np.max(coords, axis=0)
        # Get average color from original image
        color_coords = coords.copy()
        color_coords = color_coords[color_coords[:, 0] <= x_max]
        color_coords = color_coords[color_coords[:, 1] <= y_max]
        colors = [image_cv[x, y] for x, y in color_coords]
        avg_color = tuple(map(int, np.mean(colors, axis=0)))[::-1]  # Convert BGR to RGB
        blocks.append({
            'text': item[0].lower(),
            'bbox': (y_min / y_w, x_min / x_w, (y_max - y_min + 1) / y_w, (x_max - x_min + 1) / x_w),
            'color': avg_color
        })
    return blocks
 def get_blocks_from_html(html_str, image1):
    """Extract text blocks from HTML and image."""
    # Process HTML with two different color offsets
    html_str_1 = process_html_str(html_str, offset=0)
    html_str_2 = process_html_str(html_str, offset=50)
    # Render both HTML versions to images
    # TODO: Screenshot html_str_2
    filter_color = (255, 0, 0)  
    image2 = Image.new("RGB", image1.size, filter_color)
    # Find pixels that differ between the two rendered images
    different_pixels = find_different_pixels(image1, image2)
    if different_pixels is None:
        logger.warning("Unable to get pixels with different colors")
        return []
    # Extract text and color information from HTML
    html_text_color_tree = flatten_tree(extract_text_with_color(html_str_1))
    try:
        blocks = get_blocks_from_image_diff_pixels(image1, html_text_color_tree, different_pixels)
    except Exception as e:
        logger.warning(f"Unable to get blocks: {e}")
        return []
    return blocks
 def evaluate(task, generated_img):
    """Evaluate generated image against reference image using multiple metrics."""
    # Load reference image
    post_image = task['post_image']
    # Extract blocks from HTML and images
    post_blocks = get_blocks_from_html(task['post_html'], post_image)
    gen_blocks = get_blocks_from_html(task['gen_html'], generated_img)
    print("block details", post_blocks, gen_blocks)
    if not post_blocks or not gen_blocks:
        # Fallback to basic CLIP and pixel comparison if no blocks available
        clip_score = calculate_clip_similarity(post_image, generated_img, [], [])
        logger.info(f'CLIP similarity score: {clip_score}')
        # Pixel comparison
        diff = ImageChops.difference(generated_img, post_image)
        pixel_match = not diff.getbbox()
        logger.info(
            f"Pixel difference analysis: {'No difference' if pixel_match else 'Differences found'}"
        )
        return clip_score > 0.95 or pixel_match
    # Merge blocks with same bounding boxes
    post_blocks = merge_blocks_by_bbox(post_blocks)
    gen_blocks = merge_blocks_by_bbox(gen_blocks)
    # Find optimal block matching
    consecutive_bonus, window_size = 0.1, 1
    gen_blocks_m, post_blocks_m, matching = find_possible_merge(
        gen_blocks, deepcopy(post_blocks), consecutive_bonus, window_size
    )
    # Filter matches with low similarity
    filtered_matching = []
    for i, j in matching:
        text_similarity = calculate_similarity(gen_blocks_m[i], post_blocks_m[j])
        if text_similarity >= 0.5:
            filtered_matching.append([i, j, text_similarity])
    matching = filtered_matching
    if not matching:
        logger.warning('No matching blocks found')
        clip_score = calculate_clip_similarity(
            post_image, generated_img, gen_blocks, post_blocks
        )
        return clip_score > 0.95
    # Calculate metrics for matched blocks
    indices1 = [item[0] for item in matching]
    indices2 = [item[1] for item in matching]
    # Calculate unmatched areas
    unmatched_area_1 = sum(
        block['bbox'][2] * block['bbox'][3]
        for i, block in enumerate(gen_blocks_m)
        if i not in indices1
    )
    unmatched_area_2 = sum(
        block['bbox'][2] * block['bbox'][3]
        for j, block in enumerate(post_blocks_m)
        if j not in indices2
    )
    total_unmatched_area = unmatched_area_1 + unmatched_area_2
    # Calculate metrics for matched blocks
    matched_areas = []
    text_scores = []
    position_scores = []
    color_scores = []
    for i, j, text_similarity in matching:
        # Area
        block_area = (
            gen_blocks_m[i]['bbox'][2] * gen_blocks_m[i]['bbox'][3]
            + post_blocks_m[j]['bbox'][2] * post_blocks_m[j]['bbox'][3]
        )
        matched_areas.append(block_area)
        # Position similarity
        position_similarity = 1 - calculate_distance_max_1d(
            gen_blocks_m[i]['bbox'][0] + gen_blocks_m[i]['bbox'][2] / 2,
            gen_blocks_m[i]['bbox'][1] + gen_blocks_m[i]['bbox'][3] / 2,
            post_blocks_m[j]['bbox'][0] + post_blocks_m[j]['bbox'][2] / 2,
            post_blocks_m[j]['bbox'][1] + post_blocks_m[j]['bbox'][3] / 2,
        )
        # Color similarity
        color_similarity = color_similarity_ciede2000(
            gen_blocks_m[i]['color'], post_blocks_m[j]['color']
        )
        text_scores.append(text_similarity)
        position_scores.append(position_similarity)
        color_scores.append(color_similarity)
    # Calculate final scores
    total_area = sum(matched_areas) + total_unmatched_area
    size_score = sum(matched_areas) / total_area if total_area > 0 else 0
    text_score = np.mean(text_scores) if text_scores else 0
    position_score = np.mean(position_scores) if position_scores else 0
    color_score = np.mean(color_scores) if color_scores else 0
    clip_score = calculate_clip_similarity(
        post_image, generated_img, gen_blocks, post_blocks
    )
    # Combine scores with equal weights
    final_score = 0.2 * (
        size_score + text_score + position_score + color_score + clip_score
    )
    logger.info('Evaluation scores:')
    logger.info(f'- Size score: {size_score:.3f}')
    logger.info(f'- Text score: {text_score:.3f}')
    logger.info(f'- Position score: {position_score:.3f}')
    logger.info(f'- Color score: {color_score:.3f}')
    logger.info(f'- CLIP score: {clip_score:.3f}')
    logger.info(f'- Final score: {final_score:.3f}')
    return final_score > 0.8  # Consider it a match if final score > 80%
 def png_to_bytes(png):
    buffer = BytesIO()
    png.save(buffer, format='PNG')
    image_bytes = buffer.getvalue()
    return image_bytes
 def bytes_to_image(image_bytes):
    """Convert bytes to a Pillow Image object."""
    return Image.open(BytesIO(image_bytes))
 if __name__ == '__main__':
    first_image = Image.open('./evaluation/visualcodebench/data/1/post.png')
    image = Image.open('./evaluation/visualcodebench/data/1/prev.png')
    html_file = open('./evaluation/visualcodebench/data/1/post/index.html', 'r')
    first_html = html_file.read()
    html_file.close()
    html_file = open('./evaluation/visualcodebench/data/1/prev/index.html', 'r')
    gen_html = html_file.read()
    html_file.close()
    sample = {'post_image': first_image, "post_html": first_html, "gen_html": gen_html}
    evaluate(sample, image)
--- a/evaluation/benchmarks/visualcodebench/prepare.py
+++ b/evaluation/benchmarks/visualcodebench/prepare.py
@@ -0,0 +1,97 @@
 import base64
 import os
 from io import BytesIO
 import pandas as pd
 from huggingface_hub import snapshot_download
 from PIL import PngImagePlugin
 from tqdm import tqdm
 from openhands.core.logger import openhands_logger as logger
 REPO_DOWNLOAD_DIR = (
    './evaluation/visualcodebench/'  # Directory to store the downloaded repository
 )
 def download_repository():
    """
    Download the entire repository from Hugging Face Hub.
    This function clones the repository into REPO_DOWNLOAD_DIR.
    """
    repo_id = 'rvmalhot/VisualCodeBench'
    try:
        logger.info(f"Downloading repository '{repo_id}'...")
        snapshot_download(
            repo_id=repo_id,
            local_dir=REPO_DOWNLOAD_DIR,
            repo_type='dataset',
            ignore_patterns=None,  # Download all files
        )
        logger.info(f"Repository downloaded to '{REPO_DOWNLOAD_DIR}'.")
    except Exception as e:
        logger.error(f"Error downloading repository '{repo_id}': {e}")
        raise e
 def format_task_dict(example):
    instance_id = example['id']
    prev_remote_path = os.path.join(REPO_DOWNLOAD_DIR, f'data/{instance_id}/prev')
    post_remote_path = os.path.join(REPO_DOWNLOAD_DIR, f'data/{instance_id}/post')
    # Check if 'prev' and 'post' directories exist
    prev_exists = os.path.exists(prev_remote_path)
    post_exists = os.path.exists(post_remote_path)
    if prev_exists and post_exists:
        skip = False
    else:
        skip = True
    task = {
        'instance_id': instance_id,
        'prev_image': example['prev_image'],
        'post_image': example['post_image'],
        'changes': example['changes'],
        'prev_code_files': example['prev_code_files'],
        'post_code_files': example['post_code_files'],
        'skip': skip,
    }
    return task
 def prepare_visualcodebench(dataset):
    logger.info('Processing dataset')
    dataset_processed = []
    for example in tqdm(dataset['train']):
        formatted_example = format_task_dict(example)
        if formatted_example['skip']:
            continue
        del formatted_example['skip']
        dataset_processed.append(formatted_example)
    return pd.DataFrame(dataset_processed)
 def pil_image_to_base64(image: PngImagePlugin.PngImageFile) -> str:
    """
    Converts a PIL image to a Base64-encoded string.
    Parameters:
    - image (PngImagePlugin.PngImageFile): The PIL image to convert.
    Returns:
    - str: The Base64-encoded string of the image.
    """
    if not isinstance(image, PngImagePlugin.PngImageFile):
        raise ValueError(
            'The provided image is not a PIL.PngImagePlugin.PngImageFile instance.'
        )
    buffered = BytesIO()
    image.save(buffered, format='PNG')
    img_bytes = buffered.getvalue()
    img_base64 = base64.b64encode(img_bytes).decode('utf-8')
    base64_with_prefix = f'data:image/png;base64,{img_base64}'
    return [base64_with_prefix]
--- a/evaluation/benchmarks/visualcodebench/run_infer.py
+++ b/evaluation/benchmarks/visualcodebench/run_infer.py
@@ -0,0 +1,247 @@
 # FILE: run_infer.py
 import asyncio
 import os
 import shutil
 import tempfile
 from functools import partial
 import pandas as pd
 from datasets import load_dataset
 # from evaluation.benchmarks.visualcodebench.eval import capture_screenshot
 from evaluation.benchmarks.visualcodebench.prepare import (
    REPO_DOWNLOAD_DIR,
    download_repository,
    pil_image_to_base64,
    prepare_visualcodebench,
 )
 from evaluation.utils.shared import (
    EvalMetadata,
    assert_and_raise,
    codeact_user_response,
    make_metadata,
    prepare_dataset,
    reset_logger_for_multiprocessing,
    run_evaluation,
 )
 from openhands.controller.state.state import State
 from openhands.core.config import (
    AppConfig,
    SandboxConfig,
    get_llm_config_arg,
 )
 from openhands.core.config.utils import parse_arguments
 from openhands.core.logger import openhands_logger as logger  # Import OpenHands logger
 from openhands.core.main import create_runtime, run_controller
 from openhands.events.action.commands import CmdRunAction
 from openhands.events.action.message import MessageAction
 from openhands.events.observation.commands import CmdOutputObservation
 from openhands.runtime.base import Runtime
 from openhands.utils.async_utils import call_async_from_sync
 # Define workspace and output directories
 WORKSPACE_DIR = './workspace'
 FAKE_RESPONSES = {
    'CodeActAgent': partial(codeact_user_response, encapsulate_solution=True),
 }
 def get_config(
    metadata: EvalMetadata,
 ) -> AppConfig:
    config = AppConfig(
        default_agent=metadata.agent_class,
        run_as_openhands=False,
        runtime='eventstream',
        max_iterations=metadata.max_iterations,
        sandbox=SandboxConfig(
            base_container_image='python:3.12-bookworm',
            enable_auto_lint=True,
            use_host_network=False,
        ),
        # do not mount workspace
        workspace_base=None,
        workspace_mount_path=None,
    )
    config.set_llm_config(metadata.llm_config)
    return config
 def initialize_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required
 ):
    """Initialize the runtime for the agent.
    This function is called before the runtime is used to run the agent.
    """
    logger.info('-' * 30)
    logger.info('BEGIN Runtime Initialization Fn')
    logger.info('-' * 30)
    workspace_dir_name = instance['instance_id']
    obs: CmdOutputObservation
    action = CmdRunAction(command='mkdir -p /workspace/{workspace_dir_name}')
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to create /workspace/{workspace_dir_name}: {str(obs)}',
    )
    file_path = REPO_DOWNLOAD_DIR + f'data/{workspace_dir_name}/prev/index.html'
    runtime.copy_to(file_path, f'/workspace/{workspace_dir_name}')
    logger.info(f'Copied code file for instance {workspace_dir_name}')
    action = CmdRunAction(command=f'cd /workspace/{workspace_dir_name}')
    action.timeout = 600
    logger.info(action, extra={'msg_type': 'ACTION'})
    obs = runtime.run_action(action)
    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
    assert_and_raise(
        obs.exit_code == 0,
        f'Failed to cd to /workspace/{workspace_dir_name}: {str(obs)}',
    )
    logger.info('-' * 30)
    logger.info('END Runtime Initialization Fn')
    logger.info('-' * 30)
 def complete_runtime(
    runtime: Runtime,
    instance: pd.Series,  # this argument is not required, but it is used to get the workspace_dir_name
 ) -> str:
    # TODO: extract edited HTML file from agent workspace
    # temp_zip = runtime.copy_from(f'/workspace/{instance.instance_id}')
    # file_name = f'/workspace/{instance.instance_id}/index.html'
    # with zipfile.ZipFile(temp_zip, 'r') as zip_ref:
    #     if file_name in zip_ref.namelist():
    #         with zip_ref.open(file_name) as file:
    #             file_content = file.read().decode('utf-8')  # Decode bytes to string
    #     else:
    #         raise FileNotFoundError(f"'{file_name}' not found in the ZIP archive.")
    with tempfile.TemporaryDirectory() as tmpdir:
        src_folder = REPO_DOWNLOAD_DIR + f'data/{instance.instance_id}/post/'
        shutil.copytree(src_folder, tmpdir, dirs_exist_ok=True)
        # image = capture_screenshot(tmpdir)
        # if image is not None:
        #     shutil.copy(os.path.join(tmpdir, 'final_screenshot.png'), REPO_DOWNLOAD_DIR)
 def process_instance(
    instance: pd.Series, metadata: EvalMetadata, reset_logger: bool = True
 ):
    config = get_config(metadata)
    # Setup the logger properly, so you can run multi-processing to parallelize the evaluation
    if reset_logger:
        log_dir = os.path.join(metadata.eval_output_dir, 'infer_logs')
        reset_logger_for_multiprocessing(logger, instance.instance_id, log_dir)
    else:
        logger.info(f'Starting evaluation for instance {instance.instance_id}.')
    # =============================================
    # build instruction
    # =============================================
    # Prepare instruction
    instruction = (
        f"Modify the HTML/CSS according to the following instruction:\n\n"
        f"{instance['changes']}\n\n"
    )
    instruction += (
        'IMPORTANT: You should ONLY interact with the environment provided '
        'to you AND NEVER ASK FOR HUMAN HELP.\n'
    )
    # =============================================
    # create sandbox and run the agent
    # =============================================
    runtime: Runtime = create_runtime(config)
    call_async_from_sync(runtime.connect)
    try:
        initialize_runtime(runtime, instance=instance)
        image_urls = pil_image_to_base64(instance['prev_image'])
        action = MessageAction(content=instruction, image_urls=image_urls)
        state: State | None = asyncio.run(
            run_controller(
                config=config,
                initial_user_action=action,
                runtime=runtime,
                fake_user_response_fn=FAKE_RESPONSES[metadata.agent_class],
            )
        )
        if state is None:
            raise ValueError('State should not be None.')
        # =============================================
        # result evaluation
        # =============================================
        return_val = complete_runtime(runtime, instance)
        logger.info(f'Return value {return_val}')
    finally:
        runtime.close()
    # TODO: return EVAL output
 def main():
    """Main function to run the evaluation."""
    # args = parse_args()
    args = parse_arguments()
    logger.info(f"\n{'='*80}\nStarting VisualCodeBench Evaluation\n{'='*80}")
    logger.info(f'Agent: {args.agent_cls}')
    logger.info(f'Model: {args.llm_config}')
    logger.info(f'Max iterations: {args.max_iterations}')
    logger.info(f'Eval limit: {args.eval_n_limit}')
    logger.info(f'Num workers: {args.eval_num_workers}\n')
    logger.info(f'Eval output: {args.eval_output_dir}\n')
    # Step 1: Download the entire repository once
    logger.info('Downloading repository...')
    download_repository()
    # Step 2: Load Dataset
    logger.info('Loading dataset...')
    dataset = load_dataset(REPO_DOWNLOAD_DIR)
    # Step 3: Prepare dataset
    llm_config = get_llm_config_arg(args.llm_config)
    if llm_config is None:
        logger.error(f'Could not find LLM config: {args.llm_config}')
        raise ValueError(f'Could not find LLM config: {args.llm_config}')
    metadata = make_metadata(
        llm_config,
        'VisualCodeBench',
        args.agent_cls,
        args.max_iterations,
        args.eval_note,
        'evaluation/output/',
    )
    output_file = os.path.join(metadata.eval_output_dir, 'output.jsonl')
    dataset = prepare_visualcodebench(dataset)
    instances = prepare_dataset(dataset, output_file, eval_n_limit=args.eval_n_limit)
    # Step 4: Run eval
    run_evaluation(
        instances, metadata, output_file, args.eval_num_workers, process_instance
    )
 if __name__ == '__main__':
    main()
--- a/evaluation/benchmarks/visualcodebench/scripts/run_infer.sh
+++ b/evaluation/benchmarks/visualcodebench/scripts/run_infer.sh
@@ -0,0 +1,46 @@
 #!/bin/bash
 set -eo pipefail
 source "evaluation/utils/version_control.sh"
 # Check if required arguments are provided
 if [ "$#" -lt 4 ]; then
    echo "Usage: $0 [model_config] [commit_hash] [agent_cls] [eval_limit] [num_workers]"
    echo "Example: $0 llm.eval_gpt_4o_mini HEAD CodeActAgent 5 1"
    exit 1
 fi
 MODEL_CONFIG=$1
 COMMIT_HASH=$2
 AGENT_CLS=$3
 EVAL_LIMIT=$4
 NUM_WORKERS=${5:-1}  # Default to 1 worker if not specified
 # Checkout the specified commit
 checkout_eval_branch
 if [ -z "$AGENT" ]; then
  echo "Agent not specified, use default CodeActAgent"
  AGENT="CodeActAgent"
 fi
 get_openhands_version
 echo "AGENT: $AGENT"
 echo "OPENHANDS_VERSION: $OPENHANDS_VERSION"
 echo "MODEL_CONFIG: $MODEL_CONFIG"
 COMMAND="export PYTHONPATH=evaluation/benchmarks/visualcodebench:\$PYTHONPATH && poetry run python evaluation/benchmarks/visualcodebench/run_infer.py \
  --agent-cls $AGENT \
  --llm-config $MODEL_CONFIG \
  --max-iterations 5 \
  --eval-num-workers $NUM_WORKERS \
  --eval-note $OPENHANDS_VERSION" \
 if [ -n "$EVAL_LIMIT" ]; then
  echo "EVAL_LIMIT: $EVAL_LIMIT"
  COMMAND="$COMMAND --eval-n-limit $EVAL_LIMIT"
 fi
 # Run the command
 eval $COMMAND
--- a/evaluation/benchmarks/visualcodebench/server.py
+++ b/evaluation/benchmarks/visualcodebench/server.py
@@ -0,0 +1,167 @@
 import http
 import os
 import socket
 import socketserver
 import threading
 import time
 from io import BytesIO
 import requests
 from PIL import Image, ImageChops
 from playwright.sync_api import sync_playwright
 from openhands.core.logger import openhands_logger as logger
 def get_free_port():
    """Find a free port to run the HTTP server."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(('', 0))
        return s.getsockname()[1]
 def start_http_server(tmpdir):
    port = get_free_port()
    class CustomHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
        def translate_path(self, path):
            # Serve files from the specified directory instead of the current working directory
            path = super().translate_path(path)
            relative_path = os.path.relpath(path, os.getcwd())
            return os.path.join(tmpdir, relative_path)
    handler = CustomHTTPRequestHandler
    server = socketserver.TCPServer(('', port), handler)
    return server, port
 def capture_screenshot(tmpdir):
    server, port = start_http_server(tmpdir)
    server_thread = threading.Thread(target=server.serve_forever)
    server_thread.daemon = True
    server_thread.start()
    time.sleep(10)
    image = None
    try:
        server_url = f'http://localhost:{port}/'
        if not is_server_reachable(server_url):
            raise RuntimeError(f'Server not reachable at {server_url}')
        screenshot_path = os.path.join(tmpdir, 'final_screenshot.png')
        capture_screenshot_playwright(server_url, screenshot_path)
        image = Image.open(screenshot_path)
        image.load()
    finally:
        # Shut down the server and clean up
        server.shutdown()
        server.server_close()
    return image
 def is_server_reachable(url):
    """
    Check if the local server is reachable.
    """
    try:
        response = requests.get(url, timeout=5)  # Set a 5-second timeout
        if response.status_code == 200:
            logger.info(f'Server is reachable at {url}')
            return True
        else:
            logger.warning(
                f'Server responded with status code {response.status_code} at {url}'
            )
            return False
    except requests.ConnectionError as e:
        logger.error(f'Failed to connect to server at {url}: {e}')
        return False
 def capture_screenshot_playwright(url, screenshot_path):
    """Capture a screenshot of the given URL using Playwright."""
    try:
        with sync_playwright() as p:
            logger.info('Launching browser...')
            browser = p.chromium.launch(timeout=10000)  # 10 seconds for browser launch
            logger.info('Creating a new page...')
            page = browser.new_page()
            logger.info(f'Navigating to URL: {url}')
            try:
                page.goto(url, timeout=60 * 1000)  # Set timeout to 5 seconds
                logger.info('Page navigation completed.')
            except Exception as e:
                logger.warning(f'Page navigation timed out. {e}. Continuing...')
            logger.info('Waiting for network to be idle...')
            try:
                page.wait_for_load_state(
                    'networkidle', timeout=60 * 1000
                )  # Set timeout to 5 seconds
                logger.info('Page load state reached.')
            except Exception as e:
                logger.warning(f'Page load state timed out. {e}. Continuing...')
            logger.info('Capturing screenshot...')
            page.screenshot(
                path=screenshot_path, full_page=True
            )  # Capture full page screenshot
            logger.info(f'Screenshot saved to {screenshot_path}')
            browser.close()
            return True
    except Exception as e:
        logger.error(f'Error capturing screenshot with Playwright: {e}')
        return False
 def evaluate(task, screenshot_path):
    """Compare generated screenshot with post_image using CLIP score."""
    try:
        import torch
        from transformers import CLIPModel, CLIPProcessor
        # Load CLIP model and processor
        model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
        processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
        # Load images
        post_image = Image.open(BytesIO(task['post_image']))
        generated_img = Image.open(screenshot_path)
        # Process images
        inputs = processor(
            images=[post_image, generated_img], return_tensors='pt', padding=True
        )
        # Get image features
        image_features = model.get_image_features(**inputs)
        # Calculate cosine similarity
        similarity = torch.nn.functional.cosine_similarity(
            image_features[0].unsqueeze(0), image_features[1].unsqueeze(0)
        ).item()
        logger.info(f'CLIP similarity score: {similarity}')
        return similarity > 0.95  # Consider it a match if similarity > 95%
    except Exception as e:
        logger.error(f'Error in CLIP evaluation: {e}')
        # Fallback to pixel comparison if CLIP fails
        try:
            post_image = Image.open(BytesIO(task['post_image']))
            generated_img = Image.open(screenshot_path)
            # Compare images directly without converting to bytes
            diff = ImageChops.difference(generated_img, post_image)
            logger.info(
                f"Pixel difference analysis: {'No difference' if not diff.getbbox() else 'Differences found'}"
            )
            return not diff.getbbox()
        except Exception as ex:
            logger.error(f'Error in fallback evaluation: {ex}')
            return False
--- a/poetry.lock
+++ b/poetry.lock
@@ -1108,6 +1108,20 @@ humanfriendly = ">=9.1"
 [package.extras]
 cron = ["capturer (>=2.4)"]
 [[package]]
 name = "colormath"
 version = "3.0.0"
 description = "Color math and conversion library."
 optional = false
 python-versions = "*"
 files = [
    {file = "colormath-3.0.0.tar.gz", hash = "sha256:3d4605af344527da0e4f9f504fad7ddbebda35322c566a6c72e28edb1ff31217"},
 ]
 [package.dependencies]
 networkx = ">=2.0"
 numpy = "*"
 [[package]]
 name = "comm"
 version = "0.2.2"
Author	SHA1	Message	Date
rohitvinodmalhotra@gmail.com	532a284d5c	migrate to use OH version	2025-01-26 15:24:35 -05:00
rohitvinodmalhotra@gmail.com	43f6104967	Merge branch 'main' into eval/visualcodebench	2025-01-26 15:14:28 -05:00
openhands	e249b920ff	feat: adapt Design2Code block detection for in-memory evaluation	2024-11-30 19:28:22 +00:00
rohitvinodmalhotra@gmail.com	d920a69f69	adding back server code	2024-11-30 14:00:25 -05:00
openhands	a8ce888981	refactor: adapt Design2Code evaluation metrics	2024-11-30 17:17:05 +00:00
rohitvinodmalhotra@gmail.com	e22ddc0dd6	uncomment agent run	2024-11-26 17:00:07 -05:00
rohitvinodmalhotra@gmail.com	c370912f12	adding eval scripts	2024-11-26 16:57:19 -05:00