Remove FLUX TrajectoryGuidanceExtension and revert to the InpaintExtension. Keep the improved inpaint gradient mask adjustment behaviour.

This commit is contained in:
Ryan Dick
2024-09-25 15:14:11 +00:00
committed by Kent Keirsey
parent 8d50ecdfc3
commit bdeec54886
4 changed files with 37 additions and 94 deletions

View File

@@ -20,6 +20,7 @@ from invokeai.app.invocations.model import TransformerField
from invokeai.app.invocations.primitives import LatentsOutput
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.flux.denoise import denoise
from invokeai.backend.flux.inpaint_extension import InpaintExtension
from invokeai.backend.flux.model import Flux
from invokeai.backend.flux.sampling_utils import (
clip_timestep_schedule_fractional,
@@ -29,7 +30,6 @@ from invokeai.backend.flux.sampling_utils import (
pack,
unpack,
)
from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
from invokeai.backend.lora.lora_patcher import LoRAPatcher
from invokeai.backend.model_manager.config import ModelFormat
@@ -181,12 +181,14 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
# Now that we have 'packed' the latent tensors, verify that we calculated the image_seq_len correctly.
assert image_seq_len == x.shape[1]
# Prepare trajectory guidance extension.
traj_guidance_extension: TrajectoryGuidanceExtension | None = None
if init_latents is not None:
traj_guidance_extension = TrajectoryGuidanceExtension(
# Prepare inpaint extension.
inpaint_extension: InpaintExtension | None = None
if inpaint_mask is not None:
assert init_latents is not None
inpaint_extension = InpaintExtension(
init_latents=init_latents,
inpaint_mask=inpaint_mask,
noise=noise,
)
with (
@@ -234,7 +236,7 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
timesteps=timesteps,
step_callback=self._build_step_callback(context),
guidance=self.guidance,
traj_guidance_extension=traj_guidance_extension,
inpaint_extension=inpaint_extension,
)
x = unpack(x.float(), self.height, self.width)

View File

@@ -3,8 +3,8 @@ from typing import Callable
import torch
from tqdm import tqdm
from invokeai.backend.flux.inpaint_extension import InpaintExtension
from invokeai.backend.flux.model import Flux
from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
@@ -20,7 +20,7 @@ def denoise(
timesteps: list[float],
step_callback: Callable[[PipelineIntermediateState], None],
guidance: float,
traj_guidance_extension: TrajectoryGuidanceExtension | None, # noqa: F821
inpaint_extension: InpaintExtension | None,
):
# step 0 is the initial state
total_steps = len(timesteps) - 1
@@ -48,14 +48,13 @@ def denoise(
guidance=guidance_vec,
)
if traj_guidance_extension is not None:
pred = traj_guidance_extension.update_noise(
t_curr_latents=img, pred_noise=pred, t_curr=t_curr, t_prev=t_prev
)
preview_img = img - t_curr * pred
img = img + (t_prev - t_curr) * pred
if inpaint_extension is not None:
img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
step_callback(
PipelineIntermediateState(
step=step,

View File

@@ -19,8 +19,26 @@ class InpaintExtension:
self._inpaint_mask = inpaint_mask
self._noise = noise
def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
"""Applies inpaint mask gradient adjustment and returns the inpaint mask to be used at the current timestep."""
# As we progress through the denoising process, we promote gradient regions of the mask to have a full weight of
# 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
# number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
# threshold worked well.
# We use a small epsilon to avoid any potential issues with floating point precision.
eps = 1e-4
mask_gradient_t_cutoff = 0.5
if t_prev > mask_gradient_t_cutoff:
# Early in the denoising process, use the inpaint mask as-is.
return self._inpaint_mask
else:
# After the cut-off, promote all non-zero mask values to 1.0.
mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
return mask
def merge_intermediate_latents_with_init_latents(
self, intermediate_latents: torch.Tensor, timestep: float
self, intermediate_latents: torch.Tensor, t_prev: float
) -> torch.Tensor:
"""Merge the intermediate latents with the initial latents for the current timestep using the inpaint mask. I.e.
update the intermediate latents to keep the regions that are not being inpainted on the correct noise
@@ -28,8 +46,10 @@ class InpaintExtension:
This function should be called after each denoising step.
"""
mask = self._apply_mask_gradient_adjustment(t_prev)
# Noise the init latents for the current timestep.
noised_init_latents = self._noise * timestep + (1.0 - timestep) * self._init_latents
noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents
# Merge the intermediate latents with the noised_init_latents using the inpaint_mask.
return intermediate_latents * self._inpaint_mask + noised_init_latents * (1.0 - self._inpaint_mask)
return intermediate_latents * mask + noised_init_latents * (1.0 - mask)

View File

@@ -1,78 +0,0 @@
import torch
class TrajectoryGuidanceExtension:
"""An implementation of trajectory guidance for FLUX."""
def __init__(self, init_latents: torch.Tensor, inpaint_mask: torch.Tensor | None):
"""Initialize TrajectoryGuidanceExtension.
Args:
init_latents (torch.Tensor): The initial latents (i.e. un-noised at timestep 0). In 'packed' format.
inpaint_mask (torch.Tensor | None): A mask specifying which elements to inpaint. Range [0, 1]. Values of 1
will be re-generated. Values of 0 will remain unchanged. Values between 0 and 1 can be used to blend the
inpainted region with the background. In 'packed' format. If None, will be treated as a mask of all 1s.
"""
self._init_latents = init_latents
if inpaint_mask is None:
# The inpaing mask is None, so we initialize a mask with a single value of 1.0.
# This value will be broadcasted and treated as a mask of all 1s.
self._inpaint_mask = torch.ones(1, device=init_latents.device, dtype=init_latents.dtype)
else:
self._inpaint_mask = inpaint_mask
def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
"""Applies inpaint mask gradient adjustment and returns the inpaint mask to be used at the current timestep."""
# As we progress through the denoising process, we promote gradient regions of the mask to have a full weight of
# 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
# number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
# threshold worked well.
# We use a small epsilon to avoid any potential issues with floating point precision.
eps = 1e-4
mask_gradient_t_cutoff = 0.5
if t_prev > mask_gradient_t_cutoff:
# Early in the denoising process, use the inpaint mask as-is.
return self._inpaint_mask
else:
# After the cut-off, promote all non-zero mask values to 1.0.
mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
return mask
def update_noise(
self, t_curr_latents: torch.Tensor, pred_noise: torch.Tensor, t_curr: float, t_prev: float
) -> torch.Tensor:
# Handle gradient cutoff.
mask = self._apply_mask_gradient_adjustment(t_prev)
# NOTE(ryand): During inpainting, it is common to guide the denoising process by noising the initial latents for
# the current timestep and then blending the predicted intermediate latents with the noised initial latents.
# For example:
# ```
# noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents
# return t_prev_latents * self._inpaint_mask + noised_init_latents * (1.0 - self._inpaint_mask)
# ```
# Instead of guiding based on the noised initial latents, we have decided to guide based on the noise prediction
# that points towards the initial latents. The difference between these guidance strategies is minor, but
# qualitatively we found the latter to produce slightly better results. When change_ratio is 0.0 or 1.0 there is
# no difference between the two strategies.
#
# We experimented with a number of related guidance strategies, but not exhaustively. It's entirely possible
# that there's a much better way to do this.
#
# Update: This decision was made when we were relying more heavily on this guidance strategy. Now that it is
# only really applied to the inpaint gradient regions, this decision could be re-visited.
# Calculate noise guidance
# What noise should the model have predicted at this timestep to step towards self._init_latents?
# Derivation:
# > t_prev_latents = t_curr_latents + (t_prev - t_curr) * pred_noise
# > t_0_latents = t_curr_latents + (0 - t_curr) * init_traj_noise
# > t_0_latents = t_curr_latents - t_curr * init_traj_noise
# > init_traj_noise = (t_curr_latents - t_0_latents) / t_curr)
init_traj_noise = (t_curr_latents - self._init_latents) / t_curr
# Blend the init_traj_noise with the pred_noise according to the inpaint mask and the trajectory guidance.
noise = pred_noise * mask + init_traj_noise * (1.0 - mask)
return noise