Merge branch 'main' into copilot/add-seed-variance-enhancer

Lincoln Stein
2025-12-28 12:13:11 -05:00
committed by GitHub


@@ -66,12 +66,16 @@ class ZImageRegionalPromptingExtension:
) -> torch.Tensor | None:
"""Prepare a regional attention mask for Z-Image.
The mask controls which tokens can attend to each other:
- Image tokens within a region attend only to each other
This uses an 'unrestricted' image self-attention approach (similar to FLUX):
- Image tokens can attend to ALL other image tokens (unrestricted self-attention)
- Image tokens attend only to their corresponding regional text
- Text tokens attend only to their corresponding regional image
- Text tokens attend to themselves
The unrestricted image self-attention allows the model to maintain global
coherence across regions, preventing the generation of separate/disconnected
images for each region.
Z-Image sequence order: [img_tokens, txt_tokens]
Args:
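For orientation: the new docstring replaces the earlier per-region rule ("image tokens within a region attend only to each other") with unrestricted image self-attention. The sketch below builds a mask following those rules, with the sequence ordered [img_tokens, txt_tokens] and True meaning "may attend". The function name, arguments, and per-region text ranges are illustrative assumptions, not the extension's actual API:

import torch

def build_regional_attention_mask(
    region_masks: list[torch.Tensor],   # one flattened (img_seq_len,) 0/1 mask per region (assumed input)
    txt_ranges: list[tuple[int, int]],  # (start, end) of each region's text tokens within the txt block (assumed input)
    img_seq_len: int,
    txt_seq_len: int,
) -> torch.Tensor:
    total = img_seq_len + txt_seq_len
    mask = torch.zeros(total, total)
    # Unrestricted image self-attention: every image token may attend to every image token.
    mask[:img_seq_len, :img_seq_len] = 1.0
    for region_mask, (t0, t1) in zip(region_masks, txt_ranges):
        txt_start, txt_end = img_seq_len + t0, img_seq_len + t1
        # Text tokens attend to themselves (their own regional text block).
        mask[txt_start:txt_end, txt_start:txt_end] = 1.0
        # Image tokens attend only to their corresponding regional text.
        mask[:img_seq_len, txt_start:txt_end] = region_mask.view(img_seq_len, 1)
        # Text tokens attend only to their corresponding regional image.
        mask[txt_start:txt_end, :img_seq_len] = region_mask.view(1, img_seq_len)
    # Convert to boolean, mirroring the `> 0.5` conversion in the extension.
    return mask > 0.5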
@@ -129,12 +133,6 @@ class ZImageRegionalPromptingExtension:
# 3. txt attends to corresponding regional img
# Reshape mask to (1, img_seq_len) for broadcasting
regional_attention_mask[txt_start:txt_end, :img_seq_len] = mask_flat.view(1, img_seq_len)
# 4. img self-attention within region
# mask @ mask.T creates pairwise attention within the masked region
regional_attention_mask[:img_seq_len, :img_seq_len] += mask_flat.view(img_seq_len, 1) @ mask_flat.view(
1, img_seq_len
)
else:
# Global prompt: allow attention to/from background regions only
if background_region_mask is not None:
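To make the removed restricted term concrete: the outer product of a flattened region mask with itself is 1 exactly where both the query and the key image token fall inside the region, i.e. image self-attention confined to that region. A toy example with made-up values:

import torch

# 4 image tokens; tokens 1 and 2 belong to the region (illustrative values).
mask_flat = torch.tensor([0.0, 1.0, 1.0, 0.0])

# Entry (i, j) is 1 only when BOTH token i and token j are in the region.
within_region = mask_flat.view(4, 1) @ mask_flat.view(1, 4)
print(within_region)
# tensor([[0., 0., 0., 0.],
#         [0., 1., 1., 0.],
#         [0., 1., 1., 0.],
#         [0., 0., 0., 0.]])

The new code drops this per-region restriction in favor of the blanket `regional_attention_mask[:img_seq_len, :img_seq_len] = 1.0` assignment shown in the next hunk.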
@@ -152,10 +150,10 @@ class ZImageRegionalPromptingExtension:
regional_attention_mask[:img_seq_len, txt_start:txt_end] = 1.0
regional_attention_mask[txt_start:txt_end, :img_seq_len] = 1.0
# Allow background regions to attend to themselves
if background_region_mask is not None:
bg_mask = background_region_mask.view(img_seq_len, 1)
regional_attention_mask[:img_seq_len, :img_seq_len] += bg_mask @ bg_mask.T
# 4. Allow unrestricted image self-attention
# This is the key difference from the restricted approach - all image tokens
# can attend to each other, which helps maintain global coherence across regions
regional_attention_mask[:img_seq_len, :img_seq_len] = 1.0
# Convert to boolean mask
regional_attention_mask = regional_attention_mask > 0.5
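How the boolean mask is consumed downstream is not part of this diff; the sketch below assumes the usual convention where True means the position may attend (matching the `> 0.5` conversion above), and the function and tensor names are illustrative only:

import torch
import torch.nn.functional as F

def masked_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, attn_mask: torch.Tensor) -> torch.Tensor:
    # Knock out disallowed query/key pairs before the softmax.
    scores = (q @ k.transpose(-2, -1)) / (q.shape[-1] ** 0.5)
    scores = scores.masked_fill(~attn_mask, float("-inf"))
    return F.softmax(scores, dim=-1) @ v

# Toy shapes; in practice attn_mask would be the (seq, seq) regional_attention_mask.
q = k = v = torch.randn(1, 8, 16)
attn_mask = torch.ones(8, 8, dtype=torch.bool)
out = masked_attention(q, k, v, attn_mask)

# torch's fused attention accepts the same boolean convention (True = may attend):
# out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)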