Remove FLUX TrajectoryGuidanceExtension and revert to the InpaintExtension. Keep the improved inpaint gradient mask adjustment behaviour.

2026-04-23 03:00:31 -04:00 · 2024-09-25 15:14:11 +00:00
parent 8d50ecdfc3
commit bdeec54886
4 changed files with 37 additions and 94 deletions
--- a/invokeai/app/invocations/flux_denoise.py
+++ b/invokeai/app/invocations/flux_denoise.py
@@ -20,6 +20,7 @@ from invokeai.app.invocations.model import TransformerField
 from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.flux.denoise import denoise
+from invokeai.backend.flux.inpaint_extension import InpaintExtension
 from invokeai.backend.flux.model import Flux
 from invokeai.backend.flux.sampling_utils import (
    clip_timestep_schedule_fractional,
@@ -29,7 +30,6 @@ from invokeai.backend.flux.sampling_utils import (
    pack,
    unpack,
 )
-from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
 from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
 from invokeai.backend.lora.lora_patcher import LoRAPatcher
 from invokeai.backend.model_manager.config import ModelFormat
@@ -181,12 +181,14 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
        # Now that we have 'packed' the latent tensors, verify that we calculated the image_seq_len correctly.
        assert image_seq_len == x.shape[1]

-        # Prepare trajectory guidance extension.
-        traj_guidance_extension: TrajectoryGuidanceExtension | None = None
-        if init_latents is not None:
-            traj_guidance_extension = TrajectoryGuidanceExtension(
+        # Prepare inpaint extension.
+        inpaint_extension: InpaintExtension | None = None
+        if inpaint_mask is not None:
+            assert init_latents is not None
+            inpaint_extension = InpaintExtension(
                init_latents=init_latents,
                inpaint_mask=inpaint_mask,
+                noise=noise,
            )

        with (
@@ -234,7 +236,7 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
                timesteps=timesteps,
                step_callback=self._build_step_callback(context),
                guidance=self.guidance,
-                traj_guidance_extension=traj_guidance_extension,
+                inpaint_extension=inpaint_extension,
            )

        x = unpack(x.float(), self.height, self.width)
--- a/invokeai/backend/flux/denoise.py
+++ b/invokeai/backend/flux/denoise.py
@@ -3,8 +3,8 @@ from typing import Callable
 import torch
 from tqdm import tqdm

+from invokeai.backend.flux.inpaint_extension import InpaintExtension
 from invokeai.backend.flux.model import Flux
-from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState


@@ -20,7 +20,7 @@ def denoise(
    timesteps: list[float],
    step_callback: Callable[[PipelineIntermediateState], None],
    guidance: float,
-    traj_guidance_extension: TrajectoryGuidanceExtension | None,  # noqa: F821
+    inpaint_extension: InpaintExtension | None,
 ):
    # step 0 is the initial state
    total_steps = len(timesteps) - 1
@@ -48,14 +48,13 @@ def denoise(
            guidance=guidance_vec,
        )

-        if traj_guidance_extension is not None:
-            pred = traj_guidance_extension.update_noise(
-                t_curr_latents=img, pred_noise=pred, t_curr=t_curr, t_prev=t_prev
-            )
-
        preview_img = img - t_curr * pred
        img = img + (t_prev - t_curr) * pred

+        if inpaint_extension is not None:
+            img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
+            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
+
        step_callback(
            PipelineIntermediateState(
                step=step,
--- a/invokeai/backend/flux/inpaint_extension.py
+++ b/invokeai/backend/flux/inpaint_extension.py
@@ -19,8 +19,26 @@ class InpaintExtension:
        self._inpaint_mask = inpaint_mask
        self._noise = noise

+    def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
+        """Applies inpaint mask gradient adjustment and returns the inpaint mask to be used at the current timestep."""
+        # As we progress through the denoising process, we promote gradient regions of the mask to have a full weight of
+        # 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
+        # number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
+        # threshold worked well.
+        # We use a small epsilon to avoid any potential issues with floating point precision.
+        eps = 1e-4
+        mask_gradient_t_cutoff = 0.5
+        if t_prev > mask_gradient_t_cutoff:
+            # Early in the denoising process, use the inpaint mask as-is.
+            return self._inpaint_mask
+        else:
+            # After the cut-off, promote mask values above eps to 1.0 (values <= eps are kept as-is).
+            mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
+
+        return mask
+
    def merge_intermediate_latents_with_init_latents(
-        self, intermediate_latents: torch.Tensor, timestep: float
+        self, intermediate_latents: torch.Tensor, t_prev: float
    ) -> torch.Tensor:
        """Merge the intermediate latents with the initial latents for the current timestep using the inpaint mask. I.e.
        update the intermediate latents to keep the regions that are not being inpainted on the correct noise
@@ -28,8 +46,10 @@ class InpaintExtension:

        This function should be called after each denoising step.
        """
+        mask = self._apply_mask_gradient_adjustment(t_prev)
+
        # Noise the init latents for the current timestep.
-        noised_init_latents = self._noise * timestep + (1.0 - timestep) * self._init_latents
+        noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents

        # Merge the intermediate latents with the noised_init_latents using the inpaint_mask.
-        return intermediate_latents * self._inpaint_mask + noised_init_latents * (1.0 - self._inpaint_mask)
+        return intermediate_latents * mask + noised_init_latents * (1.0 - mask)
--- a/invokeai/backend/flux/trajectory_guidance_extension.py
+++ b/invokeai/backend/flux/trajectory_guidance_extension.py
@@ -1,78 +0,0 @@
-import torch
-
-
-class TrajectoryGuidanceExtension:
-    """An implementation of trajectory guidance for FLUX."""
-
-    def __init__(self, init_latents: torch.Tensor, inpaint_mask: torch.Tensor | None):
-        """Initialize TrajectoryGuidanceExtension.
-
-        Args:
-            init_latents (torch.Tensor): The initial latents (i.e. un-noised at timestep 0). In 'packed' format.
-            inpaint_mask (torch.Tensor | None): A mask specifying which elements to inpaint. Range [0, 1]. Values of 1
-                will be re-generated. Values of 0 will remain unchanged. Values between 0 and 1 can be used to blend the
-                inpainted region with the background. In 'packed' format. If None, will be treated as a mask of all 1s.
-        """
-        self._init_latents = init_latents
-        if inpaint_mask is None:
-            # The inpaing mask is None, so we initialize a mask with a single value of 1.0.
-            # This value will be broadcasted and treated as a mask of all 1s.
-            self._inpaint_mask = torch.ones(1, device=init_latents.device, dtype=init_latents.dtype)
-        else:
-            self._inpaint_mask = inpaint_mask
-
-    def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
-        """Applies inpaint mask gradient adjustment and returns the inpaint mask to be used at the current timestep."""
-        # As we progress through the denoising process, we promote gradient regions of the mask to have a full weight of
-        # 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
-        # number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
-        # threshold worked well.
-        # We use a small epsilon to avoid any potential issues with floating point precision.
-        eps = 1e-4
-        mask_gradient_t_cutoff = 0.5
-        if t_prev > mask_gradient_t_cutoff:
-            # Early in the denoising process, use the inpaint mask as-is.
-            return self._inpaint_mask
-        else:
-            # After the cut-off, promote all non-zero mask values to 1.0.
-            mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
-
-        return mask
-
-    def update_noise(
-        self, t_curr_latents: torch.Tensor, pred_noise: torch.Tensor, t_curr: float, t_prev: float
-    ) -> torch.Tensor:
-        # Handle gradient cutoff.
-        mask = self._apply_mask_gradient_adjustment(t_prev)
-
-        # NOTE(ryand): During inpainting, it is common to guide the denoising process by noising the initial latents for
-        # the current timestep and then blending the predicted intermediate latents with the noised initial latents.
-        # For example:
-        # ```
-        # noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents
-        # return t_prev_latents * self._inpaint_mask + noised_init_latents * (1.0 - self._inpaint_mask)
-        # ```
-        # Instead of guiding based on the noised initial latents, we have decided to guide based on the noise prediction
-        # that points towards the initial latents. The difference between these guidance strategies is minor, but
-        # qualitatively we found the latter to produce slightly better results. When change_ratio is 0.0 or 1.0 there is
-        # no difference between the two strategies.
-        #
-        # We experimented with a number of related guidance strategies, but not exhaustively. It's entirely possible
-        # that there's a much better way to do this.
-        #
-        # Update: This decision was made when we were relying more heavily on this guidance strategy. Now that it is
-        # only really applied to the inpaint gradient regions, this decision could be re-visited.
-
-        # Calculate noise guidance
-        # What noise should the model have predicted at this timestep to step towards self._init_latents?
-        # Derivation:
-        # > t_prev_latents = t_curr_latents + (t_prev - t_curr) * pred_noise
-        # > t_0_latents = t_curr_latents + (0 - t_curr) * init_traj_noise
-        # > t_0_latents = t_curr_latents - t_curr * init_traj_noise
-        # > init_traj_noise = (t_curr_latents - t_0_latents) / t_curr)
-        init_traj_noise = (t_curr_latents - self._init_latents) / t_curr
-
-        # Blend the init_traj_noise with the pred_noise according to the inpaint mask and the trajectory guidance.
-        noise = pred_noise * mask + init_traj_noise * (1.0 - mask)
-
-        return noise