Mirror of https://github.com/invoke-ai/InvokeAI.git (synced 2026-02-13 18:25:28 -05:00)
test fixes
fix test fix 2 fix 3 fix 4 yet another attempt new fix pray more pray lol
commit 71e6f00e10
parent e73150c3e6
committed by psychedelicious
@@ -391,28 +391,29 @@ class FluxDenoiseInvocation(BaseInvocation):
                 raise ValueError("A VAE (e.g., controlnet_vae) must be provided to use Kontext conditioning.")

             kontext_extension = KontextExtension(
-                kontext_field=self.kontext_conditioning,
                 context=context,
+                kontext_conditioning=self.kontext_conditioning,
                 vae_field=self.controlnet_vae,
                 device=TorchDevice.choose_torch_device(),
                 dtype=inference_dtype,
             )

-        final_img, final_img_ids = x, img_ids
-        original_seq_len = x.shape[1]
+        # Prepare Kontext conditioning if provided
+        img_cond_seq = None
+        img_cond_seq_ids = None
         if kontext_extension is not None:
-            final_img, final_img_ids = kontext_extension.apply(final_img, final_img_ids)
+            # Ensure batch sizes match
+            kontext_extension.ensure_batch_size(x.shape[0])
+            img_cond_seq, img_cond_seq_ids = kontext_extension.kontext_latents, kontext_extension.kontext_ids

         x = denoise(
             model=transformer,
-            img=final_img,
-            img_ids=final_img_ids,
+            img=x,
+            img_ids=img_ids,
             pos_regional_prompting_extension=pos_regional_prompting_extension,
             neg_regional_prompting_extension=neg_regional_prompting_extension,
             timesteps=timesteps,
-            step_callback=self._build_step_callback(
-                context, original_seq_len if kontext_extension is not None else None
-            ),
+            step_callback=self._build_step_callback(context),
             guidance=self.guidance,
             cfg_scale=cfg_scale,
             inpaint_extension=inpaint_extension,
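Note on the hunk above: before this commit, the invocation spliced the reference tokens into the image sequence itself (via the now-removed kontext_extension.apply) and had to carry original_seq_len around; after it, the invocation only hands the cached latents and ids to denoise(), which owns the concatenation. A rough shape sketch for orientation (hypothetical 1024x1024 generation at batch size 1; these numbers are not in the diff):

import torch

# 1024x1024 image -> 128x128 latent -> 64x64 patches of 2x2x16 = 4096 tokens of width 64.
x = torch.randn(1, 4096, 64)                # packed main-image tokens
img_cond_seq = torch.randn(1, 4096, 64)     # cached, packed Kontext reference tokens
img_cond_seq_ids = torch.zeros(1, 4096, 3)  # positional ids; channel 0 carries idx_offset=1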
@@ -420,11 +421,10 @@ class FluxDenoiseInvocation(BaseInvocation):
             pos_ip_adapter_extensions=pos_ip_adapter_extensions,
             neg_ip_adapter_extensions=neg_ip_adapter_extensions,
             img_cond=img_cond,
+            img_cond_seq=img_cond_seq,
+            img_cond_seq_ids=img_cond_seq_ids,
         )

-        if kontext_extension is not None:
-            x = x[:, :original_seq_len, :]  # Keep only the first original_seq_len tokens
-
         x = unpack(x.float(), self.height, self.width)
         return x

@@ -896,13 +896,12 @@ class FluxDenoiseInvocation(BaseInvocation):
         del lora_info

     def _build_step_callback(
-        self, context: InvocationContext, original_seq_len: Optional[int] = None
+        self, context: InvocationContext
     ) -> Callable[[PipelineIntermediateState], None]:
         def step_callback(state: PipelineIntermediateState) -> None:
-            # Extract only main image tokens if Kontext conditioning was applied
+            # The denoise function now handles Kontext conditioning correctly,
+            # so we don't need to slice the latents here
            latents = state.latents.float()
-            if original_seq_len is not None:
-                latents = latents[:, :original_seq_len, :]
             state.latents = unpack(latents, self.height, self.width).squeeze()
             context.util.flux_step_callback(state)

@@ -30,8 +30,11 @@ def denoise(
     controlnet_extensions: list[XLabsControlNetExtension | InstantXControlNetExtension],
     pos_ip_adapter_extensions: list[XLabsIPAdapterExtension],
     neg_ip_adapter_extensions: list[XLabsIPAdapterExtension],
-    # extra img tokens
+    # extra img tokens (channel-wise)
     img_cond: torch.Tensor | None,
+    # extra img tokens (sequence-wise) - for Kontext conditioning
+    img_cond_seq: torch.Tensor | None = None,
+    img_cond_seq_ids: torch.Tensor | None = None,
 ):
     # step 0 is the initial state
     total_steps = len(timesteps) - 1
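The two comments added to this signature mark two distinct conditioning mechanisms, and the distinction drives the rest of the diff: channel-wise conditioning widens each token's feature vector, while sequence-wise conditioning appends extra tokens. A minimal sketch with illustrative shapes (the widths here are assumptions, not taken from the diff):

import torch

img = torch.randn(1, 4096, 64)           # packed main-image tokens

# Channel-wise (img_cond): more features per token, sequence length unchanged.
img_cond = torch.randn(1, 4096, 256)     # e.g. a FLUX Fill-style conditioning; width illustrative
channel_wise = torch.cat((img, img_cond), dim=-1)      # -> [1, 4096, 320]

# Sequence-wise (img_cond_seq): more tokens, feature width unchanged (Kontext).
img_cond_seq = torch.randn(1, 4096, 64)  # packed reference-image tokens
sequence_wise = torch.cat((img, img_cond_seq), dim=1)  # -> [1, 8192, 64]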
@@ -46,6 +49,10 @@ def denoise(
     )
     # guidance_vec is ignored for schnell.
     guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
+
+    # Store original sequence length for slicing predictions
+    original_seq_len = img.shape[1]
+
     for step_index, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))):
         t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
@@ -71,10 +78,24 @@ def denoise(
         # controlnet_residuals datastructure is efficient in that it likely contains multiple references to the same
         # tensors. Calculating the sum materializes each tensor into its own instance.
         merged_controlnet_residuals = sum_controlnet_flux_outputs(controlnet_residuals)
-        pred_img = torch.cat((img, img_cond), dim=-1) if img_cond is not None else img
+
+        # Prepare input for model - concatenate fresh each step
+        img_input = img
+        img_input_ids = img_ids
+
+        # Add channel-wise conditioning (for ControlNet, FLUX Fill, etc.)
+        if img_cond is not None:
+            img_input = torch.cat((img_input, img_cond), dim=-1)
+
+        # Add sequence-wise conditioning (for Kontext)
+        if img_cond_seq is not None:
+            assert img_cond_seq_ids is not None, "You need to provide either both or neither of the sequence conditioning"
+            img_input = torch.cat((img_input, img_cond_seq), dim=1)
+            img_input_ids = torch.cat((img_input_ids, img_cond_seq_ids), dim=1)
+
         pred = model(
-            img=pred_img,
-            img_ids=img_ids,
+            img=img_input,
+            img_ids=img_input_ids,
             txt=pos_regional_prompting_extension.regional_text_conditioning.t5_embeddings,
             txt_ids=pos_regional_prompting_extension.regional_text_conditioning.t5_txt_ids,
             y=pos_regional_prompting_extension.regional_text_conditioning.clip_embeddings,
@@ -87,6 +108,10 @@ def denoise(
             ip_adapter_extensions=pos_ip_adapter_extensions,
             regional_prompting_extension=pos_regional_prompting_extension,
         )
+
+        # Slice prediction to only include the main image tokens
+        if img_input_ids is not None:
+            pred = pred[:, :original_seq_len]

         step_cfg_scale = cfg_scale[step_index]

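Rebuilding img_input fresh each step and slicing pred back down is what lets img keep its original length for the whole loop; that, in turn, is why the post-denoise slice and the step-callback slicing in the FluxDenoiseInvocation hunks above could be deleted. A runnable toy of the per-step pattern (toy_model and all shapes are stand-ins, not the real transformer):

import torch

def toy_model(img: torch.Tensor) -> torch.Tensor:
    return torch.zeros_like(img)  # stand-in velocity prediction

img = torch.randn(1, 16, 8)       # main tokens
ref = torch.randn(1, 16, 8)       # fixed Kontext reference tokens
timesteps = [1.0, 0.5, 0.0]
original_seq_len = img.shape[1]

for t_curr, t_prev in zip(timesteps[:-1], timesteps[1:]):
    img_input = torch.cat((img, ref), dim=1)  # concatenate fresh each step
    pred = toy_model(img_input)
    pred = pred[:, :original_seq_len]         # keep only main-image predictions
    img = img + (t_prev - t_curr) * pred      # Euler step never touches the ref tokens

assert img.shape[1] == original_seq_len       # img stays at its original length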
@@ -1,13 +1,15 @@
 import einops
 import torch
 from einops import repeat
+import numpy as np
+from PIL import Image

 from invokeai.app.invocations.fields import FluxKontextConditioningField
 from invokeai.app.invocations.flux_vae_encode import FluxVaeEncodeInvocation
 from invokeai.app.invocations.model import VAEField
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.flux.sampling_utils import pack
-from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor
+from invokeai.backend.flux.util import PREFERED_KONTEXT_RESOLUTIONS


 def generate_img_ids_with_offset(
@@ -71,7 +73,7 @@ class KontextExtension:

     def __init__(
         self,
-        kontext_field: FluxKontextConditioningField,
+        kontext_conditioning: FluxKontextConditioningField,
         context: InvocationContext,
         vae_field: VAEField,
         device: torch.device,
@@ -85,30 +87,53 @@ class KontextExtension:
         self._device = device
         self._dtype = dtype
         self._vae_field = vae_field
-        self.kontext_field = kontext_field
+        self.kontext_conditioning = kontext_conditioning

         # Pre-process and cache the kontext latents and ids upon initialization.
         self.kontext_latents, self.kontext_ids = self._prepare_kontext()

     def _prepare_kontext(self) -> tuple[torch.Tensor, torch.Tensor]:
         """Encodes the reference image and prepares its latents and IDs."""
-        image = self._context.images.get_pil(self.kontext_field.image.image_name)
-
-        # Reuse VAE encoding logic from FluxVaeEncodeInvocation
-        vae_info = self._context.models.load(self._vae_field.vae)
-        image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
-        if image_tensor.dim() == 3:
-            image_tensor = einops.rearrange(image_tensor, "c h w -> 1 c h w")
+        image = self._context.images.get_pil(self.kontext_conditioning.image.image_name)
+
+        # Calculate aspect ratio of input image
+        width, height = image.size
+        aspect_ratio = width / height
+
+        # Find the closest preferred resolution by aspect ratio
+        _, target_width, target_height = min(
+            ((abs(aspect_ratio - w / h), w, h) for w, h in PREFERED_KONTEXT_RESOLUTIONS),
+            key=lambda x: x[0]
+        )
+
+        # Apply BFL's scaling formula
+        # This ensures compatibility with the model's training
+        scaled_width = 2 * int(target_width / 16)
+        scaled_height = 2 * int(target_height / 16)
+
+        # Resize to the exact resolution used during training
+        image = image.convert("RGB")
+        final_width = 8 * scaled_width
+        final_height = 8 * scaled_height
+        image = image.resize((final_width, final_height), Image.Resampling.LANCZOS)
+
+        # Convert to tensor with same normalization as BFL
+        image_np = np.array(image)
+        image_tensor = torch.from_numpy(image_np).float() / 127.5 - 1.0
+        image_tensor = einops.rearrange(image_tensor, "h w c -> 1 c h w")
         image_tensor = image_tensor.to(self._device)

-        kontext_latents_unpacked = FluxVaeEncodeInvocation.vae_encode(vae_info=vae_info, image_tensor=image_tensor)
-
-        # Extract tensor dimensions with descriptive names
-        # Latent tensor shape: [batch_size, channels, latent_height, latent_width]
+        # Continue with VAE encoding
+        vae_info = self._context.models.load(self._vae_field.vae)
+        kontext_latents_unpacked = FluxVaeEncodeInvocation.vae_encode(
+            vae_info=vae_info,
+            image_tensor=image_tensor
+        )
+
+        # Extract tensor dimensions
         batch_size, _, latent_height, latent_width = kontext_latents_unpacked.shape

-        # Pack the latents and generate IDs. The idx_offset distinguishes these
-        # tokens from the main image's tokens, which have an index of 0.
+        # Pack the latents and generate IDs
         kontext_latents_packed = pack(kontext_latents_unpacked).to(self._device, self._dtype)
         kontext_ids = generate_img_ids_with_offset(
             latent_height=latent_height,
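A worked example of the resolution logic above, computed by hand: a 1000x800 reference image has aspect ratio 1.25, and the nearest entry in PREFERED_KONTEXT_RESOLUTIONS by aspect ratio is (1104, 944), since 1104/944 is roughly 1.169. Because every preferred resolution is already a multiple of 16, BFL's scaling formula maps it to itself:

scaled_width = 2 * int(1104 / 16)   # 2 * 69 = 138
scaled_height = 2 * int(944 / 16)   # 2 * 59 = 118
final_width = 8 * scaled_width      # 1104
final_height = 8 * scaled_height    # 944

The int-divide-then-multiply dance only changes anything if a resolution that is not a multiple of 16 ever enters the list; for the current table it is the identity.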
@@ -116,24 +141,13 @@ class KontextExtension:
             batch_size=batch_size,
             device=self._device,
             dtype=self._dtype,
-            idx_offset=1,  # Distinguishes reference tokens from main image tokens
+            idx_offset=1,
         )

         return kontext_latents_packed, kontext_ids

-    def apply(
-        self,
-        img: torch.Tensor,
-        img_ids: torch.Tensor,
-    ) -> tuple[torch.Tensor, torch.Tensor]:
-        """Concatenates the pre-processed kontext data to the main image sequence."""
-        # Ensure batch sizes match, repeating kontext data if necessary for batch operations.
-        if img.shape[0] != self.kontext_latents.shape[0]:
-            self.kontext_latents = self.kontext_latents.repeat(img.shape[0], 1, 1)
-            self.kontext_ids = self.kontext_ids.repeat(img.shape[0], 1, 1)
-
-        # Concatenate along the sequence dimension (dim=1)
-        combined_img = torch.cat([img, self.kontext_latents], dim=1)
-        combined_img_ids = torch.cat([img_ids, self.kontext_ids], dim=1)
-
-        return combined_img, combined_img_ids
+    def ensure_batch_size(self, target_batch_size: int) -> None:
+        """Ensures the kontext latents and IDs match the target batch size by repeating if necessary."""
+        if self.kontext_latents.shape[0] != target_batch_size:
+            self.kontext_latents = self.kontext_latents.repeat(target_batch_size, 1, 1)
+            self.kontext_ids = self.kontext_ids.repeat(target_batch_size, 1, 1)
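One caveat worth noting on ensure_batch_size(): Tensor.repeat multiplies the existing batch dimension, so the in-place update is only correct when the cached latents start at batch size 1, which _prepare_kontext() guarantees for a single reference image. A quick illustration:

import torch

lat = torch.randn(1, 4096, 64)   # cached with batch size 1
lat = lat.repeat(3, 1, 1)        # -> [3, 4096, 64], one copy per batch item
# If lat.shape[0] were already 2, repeat(3, 1, 1) would yield batch 6, not 3.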
@@ -174,11 +174,13 @@ def generate_img_ids(h: int, w: int, batch_size: int, device: torch.device, dtype: torch.dtype) -> torch.Tensor:
         dtype = torch.float16

     img_ids = torch.zeros(h // 2, w // 2, 3, device=device, dtype=dtype)
+    # Set batch offset to 0 for main image tokens
+    img_ids[..., 0] = 0
     img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2, device=device, dtype=dtype)[:, None]
     img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2, device=device, dtype=dtype)[None, :]
     img_ids = repeat(img_ids, "h w c -> b (h w) c", b=batch_size)

     if device.type == "mps":
-        img_ids.to(orig_dtype)
+        img_ids = img_ids.to(orig_dtype)

     return img_ids
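Two things are happening in this hunk. The mps change is a genuine bug fix: Tensor.to() is not in-place, so the old bare img_ids.to(orig_dtype) discarded its result. The new comment and explicit img_ids[..., 0] = 0 also make the id layout visible: channel 0 is an offset (0 for main-image tokens, 1 for Kontext reference tokens via idx_offset), channels 1 and 2 are the patch row and column. A hedged sketch of what generate_img_ids_with_offset presumably does, since its body is not part of this diff (the function name and parameters are taken from the call sites above; the body is a reconstruction by analogy with generate_img_ids):

import torch
from einops import repeat

def generate_img_ids_with_offset_sketch(
    latent_height: int, latent_width: int, batch_size: int,
    device: torch.device, dtype: torch.dtype, idx_offset: int = 0,
) -> torch.Tensor:
    h, w = latent_height // 2, latent_width // 2   # one id per 2x2 latent patch
    ids = torch.zeros(h, w, 3, device=device, dtype=dtype)
    ids[..., 0] = idx_offset                       # distinguishes token groups in RoPE space
    ids[..., 1] = torch.arange(h, device=device, dtype=dtype)[:, None]  # patch row
    ids[..., 2] = torch.arange(w, device=device, dtype=dtype)[None, :]  # patch column
    return repeat(ids, "h w c -> b (h w) c", b=batch_size)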
@@ -18,6 +18,17 @@ class ModelSpec:
     repo_ae: str | None


+# Preferred resolutions for Kontext models to avoid tiling artifacts
+# These are the specific resolutions the model was trained on
+PREFERED_KONTEXT_RESOLUTIONS = [
+    (672, 1568), (688, 1504), (720, 1456), (752, 1392),
+    (800, 1328), (832, 1248), (880, 1184), (944, 1104),
+    (1024, 1024), (1104, 944), (1184, 880), (1248, 832),
+    (1328, 800), (1392, 752), (1456, 720), (1504, 688),
+    (1568, 672),
+]
+
+
 max_seq_lengths: Dict[str, Literal[256, 512]] = {
     "flux-dev": 512,
     "flux-dev-fill": 512,
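A quick property check on the table, runnable against the import this commit adds: every entry is a multiple of 16 on both sides and lands within about 5% of one megapixel, which is what keeps the resize in _prepare_kontext on the model's training distribution:

from invokeai.backend.flux.util import PREFERED_KONTEXT_RESOLUTIONS

for w, h in PREFERED_KONTEXT_RESOLUTIONS:
    assert w % 16 == 0 and h % 16 == 0
    assert 0.95 < (w * h) / (1024 * 1024) < 1.05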