mirror of
https://github.com/invoke-ai/InvokeAI.git
synced 2026-04-23 03:00:31 -04:00
Remove FLUX TrajectoryGuidanceExtension and revert to the InpaintExtension. Keep the improved inpaint gradient mask adjustment behaviour.
This commit is contained in:
@@ -20,6 +20,7 @@ from invokeai.app.invocations.model import TransformerField
|
||||
from invokeai.app.invocations.primitives import LatentsOutput
|
||||
from invokeai.app.services.shared.invocation_context import InvocationContext
|
||||
from invokeai.backend.flux.denoise import denoise
|
||||
from invokeai.backend.flux.inpaint_extension import InpaintExtension
|
||||
from invokeai.backend.flux.model import Flux
|
||||
from invokeai.backend.flux.sampling_utils import (
|
||||
clip_timestep_schedule_fractional,
|
||||
@@ -29,7 +30,6 @@ from invokeai.backend.flux.sampling_utils import (
|
||||
pack,
|
||||
unpack,
|
||||
)
|
||||
from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
|
||||
from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
|
||||
from invokeai.backend.lora.lora_patcher import LoRAPatcher
|
||||
from invokeai.backend.model_manager.config import ModelFormat
|
||||
@@ -181,12 +181,14 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
|
||||
# Now that we have 'packed' the latent tensors, verify that we calculated the image_seq_len correctly.
|
||||
assert image_seq_len == x.shape[1]
|
||||
|
||||
# Prepare trajectory guidance extension.
|
||||
traj_guidance_extension: TrajectoryGuidanceExtension | None = None
|
||||
if init_latents is not None:
|
||||
traj_guidance_extension = TrajectoryGuidanceExtension(
|
||||
# Prepare inpaint extension.
|
||||
inpaint_extension: InpaintExtension | None = None
|
||||
if inpaint_mask is not None:
|
||||
assert init_latents is not None
|
||||
inpaint_extension = InpaintExtension(
|
||||
init_latents=init_latents,
|
||||
inpaint_mask=inpaint_mask,
|
||||
noise=noise,
|
||||
)
|
||||
|
||||
with (
|
||||
@@ -234,7 +236,7 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
|
||||
timesteps=timesteps,
|
||||
step_callback=self._build_step_callback(context),
|
||||
guidance=self.guidance,
|
||||
traj_guidance_extension=traj_guidance_extension,
|
||||
inpaint_extension=inpaint_extension,
|
||||
)
|
||||
|
||||
x = unpack(x.float(), self.height, self.width)
|
||||
|
||||
@@ -3,8 +3,8 @@ from typing import Callable
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from invokeai.backend.flux.inpaint_extension import InpaintExtension
|
||||
from invokeai.backend.flux.model import Flux
|
||||
from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
|
||||
from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ def denoise(
|
||||
timesteps: list[float],
|
||||
step_callback: Callable[[PipelineIntermediateState], None],
|
||||
guidance: float,
|
||||
traj_guidance_extension: TrajectoryGuidanceExtension | None, # noqa: F821
|
||||
inpaint_extension: InpaintExtension | None,
|
||||
):
|
||||
# step 0 is the initial state
|
||||
total_steps = len(timesteps) - 1
|
||||
@@ -48,14 +48,13 @@ def denoise(
|
||||
guidance=guidance_vec,
|
||||
)
|
||||
|
||||
if traj_guidance_extension is not None:
|
||||
pred = traj_guidance_extension.update_noise(
|
||||
t_curr_latents=img, pred_noise=pred, t_curr=t_curr, t_prev=t_prev
|
||||
)
|
||||
|
||||
preview_img = img - t_curr * pred
|
||||
img = img + (t_prev - t_curr) * pred
|
||||
|
||||
if inpaint_extension is not None:
|
||||
img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
|
||||
preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
|
||||
|
||||
step_callback(
|
||||
PipelineIntermediateState(
|
||||
step=step,
|
||||
|
||||
@@ -19,8 +19,26 @@ class InpaintExtension:
|
||||
self._inpaint_mask = inpaint_mask
|
||||
self._noise = noise
|
||||
|
||||
def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
|
||||
"""Applies inpaint mask gradient adjustment and returns the inpaint mask to be used at the current timestep."""
|
||||
# As we progress through the denoising process, we promote gradient regions of the mask to have a full weight of
|
||||
# 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
|
||||
# number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
|
||||
# threshold worked well.
|
||||
# We use a small epsilon to avoid any potential issues with floating point precision.
|
||||
eps = 1e-4
|
||||
mask_gradient_t_cutoff = 0.5
|
||||
if t_prev > mask_gradient_t_cutoff:
|
||||
# Early in the denoising process, use the inpaint mask as-is.
|
||||
return self._inpaint_mask
|
||||
else:
|
||||
# After the cut-off, promote all non-zero mask values to 1.0.
|
||||
mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
|
||||
|
||||
return mask
|
||||
|
||||
def merge_intermediate_latents_with_init_latents(
|
||||
self, intermediate_latents: torch.Tensor, timestep: float
|
||||
self, intermediate_latents: torch.Tensor, t_prev: float
|
||||
) -> torch.Tensor:
|
||||
"""Merge the intermediate latents with the initial latents for the current timestep using the inpaint mask. I.e.
|
||||
update the intermediate latents to keep the regions that are not being inpainted on the correct noise
|
||||
@@ -28,8 +46,10 @@ class InpaintExtension:
|
||||
|
||||
This function should be called after each denoising step.
|
||||
"""
|
||||
mask = self._apply_mask_gradient_adjustment(t_prev)
|
||||
|
||||
# Noise the init latents for the current timestep.
|
||||
noised_init_latents = self._noise * timestep + (1.0 - timestep) * self._init_latents
|
||||
noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents
|
||||
|
||||
# Merge the intermediate latents with the noised_init_latents using the inpaint_mask.
|
||||
return intermediate_latents * self._inpaint_mask + noised_init_latents * (1.0 - self._inpaint_mask)
|
||||
return intermediate_latents * mask + noised_init_latents * (1.0 - mask)
|
||||
|
||||
@@ -1,78 +0,0 @@
|
||||
import torch
|
||||
|
||||
|
||||
class TrajectoryGuidanceExtension:
|
||||
"""An implementation of trajectory guidance for FLUX."""
|
||||
|
||||
def __init__(self, init_latents: torch.Tensor, inpaint_mask: torch.Tensor | None):
|
||||
"""Initialize TrajectoryGuidanceExtension.
|
||||
|
||||
Args:
|
||||
init_latents (torch.Tensor): The initial latents (i.e. un-noised at timestep 0). In 'packed' format.
|
||||
inpaint_mask (torch.Tensor | None): A mask specifying which elements to inpaint. Range [0, 1]. Values of 1
|
||||
will be re-generated. Values of 0 will remain unchanged. Values between 0 and 1 can be used to blend the
|
||||
inpainted region with the background. In 'packed' format. If None, will be treated as a mask of all 1s.
|
||||
"""
|
||||
self._init_latents = init_latents
|
||||
if inpaint_mask is None:
|
||||
# The inpaing mask is None, so we initialize a mask with a single value of 1.0.
|
||||
# This value will be broadcasted and treated as a mask of all 1s.
|
||||
self._inpaint_mask = torch.ones(1, device=init_latents.device, dtype=init_latents.dtype)
|
||||
else:
|
||||
self._inpaint_mask = inpaint_mask
|
||||
|
||||
def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
|
||||
"""Applies inpaint mask gradient adjustment and returns the inpaint mask to be used at the current timestep."""
|
||||
# As we progress through the denoising process, we promote gradient regions of the mask to have a full weight of
|
||||
# 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
|
||||
# number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
|
||||
# threshold worked well.
|
||||
# We use a small epsilon to avoid any potential issues with floating point precision.
|
||||
eps = 1e-4
|
||||
mask_gradient_t_cutoff = 0.5
|
||||
if t_prev > mask_gradient_t_cutoff:
|
||||
# Early in the denoising process, use the inpaint mask as-is.
|
||||
return self._inpaint_mask
|
||||
else:
|
||||
# After the cut-off, promote all non-zero mask values to 1.0.
|
||||
mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
|
||||
|
||||
return mask
|
||||
|
||||
def update_noise(
|
||||
self, t_curr_latents: torch.Tensor, pred_noise: torch.Tensor, t_curr: float, t_prev: float
|
||||
) -> torch.Tensor:
|
||||
# Handle gradient cutoff.
|
||||
mask = self._apply_mask_gradient_adjustment(t_prev)
|
||||
|
||||
# NOTE(ryand): During inpainting, it is common to guide the denoising process by noising the initial latents for
|
||||
# the current timestep and then blending the predicted intermediate latents with the noised initial latents.
|
||||
# For example:
|
||||
# ```
|
||||
# noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents
|
||||
# return t_prev_latents * self._inpaint_mask + noised_init_latents * (1.0 - self._inpaint_mask)
|
||||
# ```
|
||||
# Instead of guiding based on the noised initial latents, we have decided to guide based on the noise prediction
|
||||
# that points towards the initial latents. The difference between these guidance strategies is minor, but
|
||||
# qualitatively we found the latter to produce slightly better results. When change_ratio is 0.0 or 1.0 there is
|
||||
# no difference between the two strategies.
|
||||
#
|
||||
# We experimented with a number of related guidance strategies, but not exhaustively. It's entirely possible
|
||||
# that there's a much better way to do this.
|
||||
#
|
||||
# Update: This decision was made when we were relying more heavily on this guidance strategy. Now that it is
|
||||
# only really applied to the inpaint gradient regions, this decision could be re-visited.
|
||||
|
||||
# Calculate noise guidance
|
||||
# What noise should the model have predicted at this timestep to step towards self._init_latents?
|
||||
# Derivation:
|
||||
# > t_prev_latents = t_curr_latents + (t_prev - t_curr) * pred_noise
|
||||
# > t_0_latents = t_curr_latents + (0 - t_curr) * init_traj_noise
|
||||
# > t_0_latents = t_curr_latents - t_curr * init_traj_noise
|
||||
# > init_traj_noise = (t_curr_latents - t_0_latents) / t_curr)
|
||||
init_traj_noise = (t_curr_latents - self._init_latents) / t_curr
|
||||
|
||||
# Blend the init_traj_noise with the pred_noise according to the inpaint mask and the trajectory guidance.
|
||||
noise = pred_noise * mask + init_traj_noise * (1.0 - mask)
|
||||
|
||||
return noise
|
||||
Reference in New Issue
Block a user