From d7d051200f9c0e22dfe216aac8fb5e9b67a07331 Mon Sep 17 00:00:00 2001
From: Alexander Eichhorn
Date: Sun, 28 Dec 2025 17:32:50 +0100
Subject: [PATCH] fix(z_image): use unrestricted image self-attention for
 regional prompting (#8718)

Changes image self-attention from restricted (region-isolated) to
unrestricted (all image tokens can attend to each other), similar to the
FLUX approach. This fixes the issue where ZImage-Turbo with multiple
regional guidance layers would generate two separate/disconnected images
instead of compositing them into a single unified image. The regional
text-image attention remains restricted, so each region still responds to
its corresponding prompt.

Fixes #8715
---
 .../regional_prompting_extension.py           | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/invokeai/backend/z_image/extensions/regional_prompting_extension.py b/invokeai/backend/z_image/extensions/regional_prompting_extension.py
index 3bb11d5ead..26f91749f7 100644
--- a/invokeai/backend/z_image/extensions/regional_prompting_extension.py
+++ b/invokeai/backend/z_image/extensions/regional_prompting_extension.py
@@ -66,12 +66,16 @@ class ZImageRegionalPromptingExtension:
     ) -> torch.Tensor | None:
         """Prepare a regional attention mask for Z-Image.
 
-        The mask controls which tokens can attend to each other:
-        - Image tokens within a region attend only to each other
+        This uses an 'unrestricted' image self-attention approach (similar to FLUX):
+        - Image tokens can attend to ALL other image tokens (unrestricted self-attention)
         - Image tokens attend only to their corresponding regional text
         - Text tokens attend only to their corresponding regional image
         - Text tokens attend to themselves
 
+        The unrestricted image self-attention allows the model to maintain global
+        coherence across regions, preventing the generation of separate/disconnected
+        images for each region.
+
         Z-Image sequence order: [img_tokens, txt_tokens]
 
         Args:
@@ -129,12 +133,6 @@ class ZImageRegionalPromptingExtension:
                 # 3. txt attends to corresponding regional img
                 # Reshape mask to (1, img_seq_len) for broadcasting
                 regional_attention_mask[txt_start:txt_end, :img_seq_len] = mask_flat.view(1, img_seq_len)
-
-                # 4. img self-attention within region
-                # mask @ mask.T creates pairwise attention within the masked region
-                regional_attention_mask[:img_seq_len, :img_seq_len] += mask_flat.view(img_seq_len, 1) @ mask_flat.view(
-                    1, img_seq_len
-                )
             else:
                 # Global prompt: allow attention to/from background regions only
                 if background_region_mask is not None:
@@ -152,10 +150,10 @@ class ZImageRegionalPromptingExtension:
                     regional_attention_mask[:img_seq_len, txt_start:txt_end] = 1.0
                     regional_attention_mask[txt_start:txt_end, :img_seq_len] = 1.0
 
-        # Allow background regions to attend to themselves
-        if background_region_mask is not None:
-            bg_mask = background_region_mask.view(img_seq_len, 1)
-            regional_attention_mask[:img_seq_len, :img_seq_len] += bg_mask @ bg_mask.T
+        # 4. Allow unrestricted image self-attention
+        # This is the key difference from the restricted approach - all image tokens
+        # can attend to each other, which helps maintain global coherence across regions
+        regional_attention_mask[:img_seq_len, :img_seq_len] = 1.0
 
         # Convert to boolean mask
         regional_attention_mask = regional_attention_mask > 0.5
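
As a sanity check on the mask semantics described in the docstring above, the
following is a minimal, self-contained PyTorch sketch of the resulting layout:
unrestricted image self-attention plus region-restricted text-image attention
over a [img_tokens, txt_tokens] sequence. The function name
build_regional_attn_mask, its arguments, and the toy shapes are illustrative
assumptions only, not the extension's actual API; only the masking pattern
mirrors the patch.

    import torch

    def build_regional_attn_mask(
        img_seq_len: int,
        region_masks: list[torch.Tensor],   # hypothetical: one flattened 0/1 mask of shape (img_seq_len,) per region
        txt_ranges: list[tuple[int, int]],  # hypothetical: (start, end) of each region's text tokens within the text segment
        txt_seq_len: int,
    ) -> torch.Tensor:
        """Build a boolean attention mask for a [img_tokens, txt_tokens] sequence."""
        seq_len = img_seq_len + txt_seq_len
        mask = torch.zeros(seq_len, seq_len)

        # Unrestricted image self-attention: every image token sees every image token.
        mask[:img_seq_len, :img_seq_len] = 1.0

        for region_mask, (t0, t1) in zip(region_masks, txt_ranges):
            txt_start, txt_end = img_seq_len + t0, img_seq_len + t1
            # Image tokens inside the region attend to that region's text tokens...
            mask[:img_seq_len, txt_start:txt_end] = region_mask.view(img_seq_len, 1)
            # ...and those text tokens attend back to the region's image tokens.
            mask[txt_start:txt_end, :img_seq_len] = region_mask.view(1, img_seq_len)
            # Text tokens of a region attend to themselves.
            mask[txt_start:txt_end, txt_start:txt_end] = 1.0

        # Convert to boolean, as the patch does.
        return mask > 0.5

    # Toy example: a 4-token image split into left/right halves, two 2-token prompts.
    left = torch.tensor([1.0, 1.0, 0.0, 0.0])
    right = torch.tensor([0.0, 0.0, 1.0, 1.0])
    attn_mask = build_regional_attn_mask(4, [left, right], [(0, 2), (2, 4)], txt_seq_len=4)
    print(attn_mask.int())

In the printed 8x8 mask, the top-left 4x4 image block is all ones (the
unrestricted self-attention this patch introduces), while each prompt's text
rows and columns intersect only their own image half.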