Make T2I Adapters work with any resolution supported by the models (#7215)

## Summary

This change mimics the UNet padding strategy to align T2I feature maps with the latents during denoising. It also slightly adjusts the crop and scale logic so that the control will match the input image without shifting when it needs to pad.

## Related Issues / Discussions

## QA Instructions

Image generated at 1032x1024

![image](https://github.com/user-attachments/assets/7ea579e4-61dc-4b6b-aa84-33d676d160c6)

Image generated at 1080x1040 to prove feature alignment.

![image](https://github.com/user-attachments/assets/ee6e5b6a-d0d5-474d-9fc4-f65c104964bd)

Edge artifacts on the bottom and right are a result of SDXL's UNet padding, and T2I influence will be cut off in those regions.

## Merge Plan

Contingent on #7205

Currently the Canvas UI prevents users from generating at resolutions that are not multiples of 64 while T2I adapter layers are active. Will leave this as a draft until fixing that.

## Checklist

- [x] _The PR has a short but descriptive title, suitable for a changelog_
- [ ] _Tests added / updated (if applicable)_
- [ ] _Documentation added / updated (if applicable)_
2026-04-23 03:00:31 -04:00 · 2024-11-01 13:22:00 +11:00
parent 26f95d6a97 6fbc019142
commit 016a6f182f
3 changed files with 35 additions and 49 deletions
--- a/invokeai/app/invocations/denoise_latents.py
+++ b/invokeai/app/invocations/denoise_latents.py
@@ -622,7 +622,7 @@ class DenoiseLatentsInvocation(BaseInvocation):
        for t2i_adapter_field in t2i_adapter:
            t2i_adapter_model_config = context.models.get_config(t2i_adapter_field.t2i_adapter_model.key)
            t2i_adapter_loaded_model = context.models.load(t2i_adapter_field.t2i_adapter_model)
-            image = context.images.get_pil(t2i_adapter_field.image.image_name)
+            image = context.images.get_pil(t2i_adapter_field.image.image_name, mode="RGB")

            # The max_unet_downscale is the maximum amount that the UNet model downscales the latent image internally.
            if t2i_adapter_model_config.base == BaseModelType.StableDiffusion1:
@@ -640,29 +640,39 @@ class DenoiseLatentsInvocation(BaseInvocation):
            with t2i_adapter_loaded_model as t2i_adapter_model:
                total_downscale_factor = t2i_adapter_model.total_downscale_factor

-                # Resize the T2I-Adapter input image.
-                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
-                # result will match the latent image's dimensions after max_unet_downscale is applied.
-                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
-                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
-
                # Note: We have hard-coded `do_classifier_free_guidance=False`. This is because we only want to prepare
                # a single image. If CFG is enabled, we will duplicate the resultant tensor after applying the
                # T2I-Adapter model.
                #
                # Note: We re-use the `prepare_control_image(...)` from ControlNet for T2I-Adapter, because it has many
                # of the same requirements (e.g. preserving binary masks during resize).
+
+                # Assuming fixed dimensional scaling of LATENT_SCALE_FACTOR.
+                _, _, latent_height, latent_width = latents_shape
+                control_height_resize = latent_height * LATENT_SCALE_FACTOR
+                control_width_resize = latent_width * LATENT_SCALE_FACTOR
                t2i_image = prepare_control_image(
                    image=image,
                    do_classifier_free_guidance=False,
-                    width=t2i_input_width,
-                    height=t2i_input_height,
+                    width=control_width_resize,
+                    height=control_height_resize,
                    num_channels=t2i_adapter_model.config["in_channels"],  # mypy treats this as a FrozenDict
                    device=t2i_adapter_model.device,
                    dtype=t2i_adapter_model.dtype,
                    resize_mode=t2i_adapter_field.resize_mode,
                )

+                # Resize the T2I-Adapter input image.
+                # We select the resize dimensions so that after the T2I-Adapter's total_downscale_factor is applied, the
+                # result will match the latent image's dimensions after max_unet_downscale is applied.
+                # We crop the image to this size so that the positions match the input image on non-standard resolutions
+                t2i_input_height = latents_shape[2] // max_unet_downscale * total_downscale_factor
+                t2i_input_width = latents_shape[3] // max_unet_downscale * total_downscale_factor
+                if t2i_image.shape[2] > t2i_input_height or t2i_image.shape[3] > t2i_input_width:
+                    t2i_image = t2i_image[
+                        :, :, : min(t2i_image.shape[2], t2i_input_height), : min(t2i_image.shape[3], t2i_input_width)
+                    ]
+
                adapter_state = t2i_adapter_model(t2i_image)

            if do_classifier_free_guidance:
--- a/invokeai/backend/stable_diffusion/diffusers_pipeline.py
+++ b/invokeai/backend/stable_diffusion/diffusers_pipeline.py
@@ -499,6 +499,22 @@ class StableDiffusionGeneratorPipeline(StableDiffusionPipeline):
                    for idx, value in enumerate(single_t2i_adapter_data.adapter_state):
                        accum_adapter_state[idx] += value * t2i_adapter_weight

+            # Hack: force compatibility with irregular resolutions by padding the feature map with zeros
+            for idx, tensor in enumerate(accum_adapter_state):
+                # The tensor size is supposed to be some integer downscale factor of the latents size.
+                # Internally, the unet will pad the latents before downscaling between levels when it is no longer divisible by its downscale factor.
+                # If the latent size does not scale down evenly, we need to pad the tensor so that it matches the downscaled padded latents later on.
+                scale_factor = latents.size()[-1] // tensor.size()[-1]
+                required_padding_width = math.ceil(latents.size()[-1] / scale_factor) - tensor.size()[-1]
+                required_padding_height = math.ceil(latents.size()[-2] / scale_factor) - tensor.size()[-2]
+                tensor = torch.nn.functional.pad(
+                    tensor,
+                    (0, required_padding_width, 0, required_padding_height, 0, 0, 0, 0),
+                    mode="constant",
+                    value=0,
+                )
+                accum_adapter_state[idx] = tensor
+
            down_intrablock_additional_residuals = accum_adapter_state

        # Handle inpainting models.
--- a/invokeai/frontend/web/src/common/hooks/useIsReadyToEnqueue.ts
+++ b/invokeai/frontend/web/src/common/hooks/useIsReadyToEnqueue.ts
@@ -202,46 +202,6 @@ const createSelector = (
            if (controlLayer.controlAdapter.model?.base !== model?.base) {
              problems.push(i18n.t('parameters.invoke.layer.controlAdapterIncompatibleBaseModel'));
            }
-            // T2I Adapters require images have dimensions that are multiples of 64 (SD1.5) or 32 (SDXL)
-            if (controlLayer.controlAdapter.type === 't2i_adapter') {
-              const multiple = model?.base === 'sdxl' ? 32 : 64;
-              if (bbox.scaleMethod === 'none') {
-                if (bbox.rect.width % 16 !== 0) {
-                  reasons.push({
-                    content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxWidth', {
-                      multiple,
-                      width: bbox.rect.width,
-                    }),
-                  });
-                }
-                if (bbox.rect.height % 16 !== 0) {
-                  reasons.push({
-                    content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleBboxHeight', {
-                      multiple,
-                      height: bbox.rect.height,
-                    }),
-                  });
-                }
-              } else {
-                if (bbox.scaledSize.width % 16 !== 0) {
-                  reasons.push({
-                    content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxWidth', {
-                      multiple,
-                      width: bbox.scaledSize.width,
-                    }),
-                  });
-                }
-                if (bbox.scaledSize.height % 16 !== 0) {
-                  reasons.push({
-                    content: i18n.t('parameters.invoke.layer.t2iAdapterIncompatibleScaledBboxHeight', {
-                      multiple,
-                      height: bbox.scaledSize.height,
-                    }),
-                  });
-                }
-              }
-            }
-
            if (problems.length) {
              const content = upperFirst(problems.join(', '));
              reasons.push({ prefix, content });