diff --git a/evaluation/benchmarks/swe_bench/resource/mapping.py b/evaluation/benchmarks/swe_bench/resource/mapping.py
index be6ddad621..d29d9a11c1 100644
--- a/evaluation/benchmarks/swe_bench/resource/mapping.py
+++ b/evaluation/benchmarks/swe_bench/resource/mapping.py
@@ -28,7 +28,7 @@ def get_resource_mapping(dataset_name: str) -> dict[str, float]:
         with open(file_path, 'r') as f:
             _global_resource_mapping[dataset_name] = json.load(f)
-        logger.info(f'Loaded resource mapping for {dataset_name}')
+        logger.debug(f'Loaded resource mapping for {dataset_name}')
     return _global_resource_mapping[dataset_name]
diff --git a/evaluation/benchmarks/swe_bench/run_infer.py b/evaluation/benchmarks/swe_bench/run_infer.py
index 23ce3a1cc9..57ab8ed20d 100644
--- a/evaluation/benchmarks/swe_bench/run_infer.py
+++ b/evaluation/benchmarks/swe_bench/run_infer.py
@@ -121,7 +121,7 @@ Be thorough in your exploration, testing, and reasoning. It's fine if your think
     )

     if 'image_assets' in instance:
-        assets = instance['image_assets']
+        assets = json.loads(instance['image_assets'])
         assert (
             'problem_statement' in assets
         ), 'problem_statement is required in image_assets'
@@ -146,8 +146,8 @@ def get_instance_docker_image(
         # swebench/sweb.eval.x86_64.django_1776_django-11333:v1
         docker_image_prefix = 'docker.io/swebench/'
         repo, name = instance_id.split('__')
-        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'
-        logger.info(f'Using official SWE-Bench image: {image_name}')
+        image_name = f'swebench/sweb.eval.x86_64.{repo}_1776_{name}:latest'.lower()
+        logger.debug(f'Using official SWE-Bench image: {image_name}')
         return image_name
     else:
         # OpenHands version of the image
@@ -164,10 +164,7 @@ def get_config(
     metadata: EvalMetadata,
 ) -> AppConfig:
     # We use a different instance image for each instance of the swe-bench eval
-    use_swebench_official_image = bool(
-        ('verified' in metadata.dataset.lower() or 'lite' in metadata.dataset.lower())
-        and 'swe-gym' not in metadata.dataset.lower()
-    )
+    use_swebench_official_image = 'swe-gym' not in metadata.dataset.lower()
     base_container_image = get_instance_docker_image(
         instance['instance_id'],
         swebench_official_image=use_swebench_official_image,
@@ -334,15 +331,18 @@ def initialize_runtime(
     logger.info(obs, extra={'msg_type': 'OBSERVATION'})
     assert_and_raise(obs.exit_code == 0, f'Failed to remove git remotes: {str(obs)}')

-    action = CmdRunAction(command='which python')
-    action.set_hard_timeout(600)
-    logger.info(action, extra={'msg_type': 'ACTION'})
-    obs = runtime.run_action(action)
-    logger.info(obs, extra={'msg_type': 'OBSERVATION'})
-    assert_and_raise(
-        obs.exit_code == 0 and 'testbed' in obs.content,
-        f'Expected to find python interpreter from testbed, but got: {str(obs)}',
-    )
+    if 'multimodal' not in metadata.dataset.lower():
+        # Only non-multimodal datasets need the testbed environment activated for Python;
+        # the SWE-Bench multimodal datasets do not use the testbed environment
+        action = CmdRunAction(command='which python')
+        action.set_hard_timeout(600)
+        logger.info(action, extra={'msg_type': 'ACTION'})
+        obs = runtime.run_action(action)
+        logger.info(obs, extra={'msg_type': 'OBSERVATION'})
+        assert_and_raise(
+            obs.exit_code == 0 and 'testbed' in obs.content,
+            f'Expected to find python interpreter from testbed, but got: {str(obs)}',
+        )

     logger.info('-' * 30)
     logger.info('END Runtime Initialization Fn')
@@ -761,9 +761,19 @@ if __name__ == '__main__':
         with open(cur_output_file, 'r') as f:
             for line in f:
                 instance = json.loads(line)
-                history = [event_from_dict(event) for event in instance['history']]
-                critic_result = critic.evaluate(history)
-                if not critic_result.success:
+                try:
+                    history = [
+                        event_from_dict(event) for event in instance['history']
+                    ]
+                    critic_result = critic.evaluate(
+                        history, instance['test_result'].get('git_patch', '')
+                    )
+                    if not critic_result.success:
+                        instances_failed.append(instance['instance_id'])
+                except Exception as e:
+                    logger.error(
+                        f'Error loading history for instance {instance["instance_id"]}: {e}'
+                    )
                     instances_failed.append(instance['instance_id'])
         logger.info(
             f'{len(instances_failed)} instances failed the current attempt {attempt}: {instances_failed}'
diff --git a/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh b/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh
index 6bacc93fde..61ca1e1510 100755
--- a/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh
+++ b/evaluation/benchmarks/swe_bench/scripts/setup/instance_swe_entry.sh
@@ -18,6 +18,7 @@ if [[ -z "$item" ]]; then
     exit 1
 fi

+
 WORKSPACE_NAME=$(echo "$item" | jq -r '(.repo | tostring) + "__" + (.version | tostring) | gsub("/"; "__")')

 echo "WORKSPACE_NAME: $WORKSPACE_NAME"
@@ -36,5 +37,7 @@ mkdir -p /workspace
 cp -r /testbed /workspace/$WORKSPACE_NAME

 # Activate instance-specific environment
-. /opt/miniconda3/etc/profile.d/conda.sh
-conda activate testbed
+if [ -d /opt/miniconda3 ]; then
+    . /opt/miniconda3/etc/profile.d/conda.sh
+    conda activate testbed
+fi
diff --git a/openhands/critic/base.py b/openhands/critic/base.py
index 58a125850e..bfde3827d8 100644
--- a/openhands/critic/base.py
+++ b/openhands/critic/base.py
@@ -23,9 +23,11 @@ class CriticResult(BaseModel):

 class BaseCritic(abc.ABC):
     """
-    A critic is a function that takes in a list of events and returns a score about the quality of those events.
+    A critic is a function that takes in a list of events and an optional git patch, and returns a score reflecting the quality of those events.
     """

     @abc.abstractmethod
-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         pass
diff --git a/openhands/critic/finish_critic.py b/openhands/critic/finish_critic.py
index 8da3bc2c06..ca8dcd872b 100644
--- a/openhands/critic/finish_critic.py
+++ b/openhands/critic/finish_critic.py
@@ -5,16 +5,21 @@ from openhands.events.action import Action, AgentFinishAction


 class AgentFinishedCritic(BaseCritic):
     """This is a simple rule-based critic that checks if the last event is an AgentFinishAction.

-    If not, it will return a score of 0 and a message indicating that the agent did not finish.
+    If a git patch is provided but empty, it will return a score of 0 and a message indicating that the git patch is empty.
     """

     def __init__(self):
         pass

-    def evaluate(self, events: list[Event]) -> CriticResult:
+    def evaluate(
+        self, events: list[Event], git_patch: str | None = None
+    ) -> CriticResult:
         last_action = next((h for h in reversed(events) if isinstance(h, Action)), None)
+        if git_patch is not None and len(git_patch.strip()) == 0:
+            return CriticResult(score=0, message='Git patch is empty.')
+
         if isinstance(last_action, AgentFinishAction):
             return CriticResult(score=1, message='Agent finished.')
         else:
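
Usage sketch (not part of the patch above): the change threads the evaluation's `git_patch` into the critic, so an empty patch now fails an instance even when the agent emitted an `AgentFinishAction`. A minimal illustration of the new contract, assuming `AgentFinishAction` can be constructed with its default arguments; the assertions follow directly from the `CriticResult` values in the diff.

```python
from openhands.critic.finish_critic import AgentFinishedCritic
from openhands.events.action import AgentFinishAction

critic = AgentFinishedCritic()
events = [AgentFinishAction()]  # a history whose last action is a finish

# Empty patch: the new guard short-circuits to failure despite the finish action.
result = critic.evaluate(events, git_patch='')
assert result.score == 0 and result.message == 'Git patch is empty.'

# Non-empty patch: falls through to the original last-action check.
assert critic.evaluate(events, git_patch='diff --git a/f b/f').score == 1

# No patch supplied (git_patch defaults to None): behaves exactly as before.
assert critic.evaluate(events).score == 1
```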