InvokeAI/invokeai/app/invocations/create_gradient_mask.py

from typing import Literal, Optional

import cv2
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import resize as tv_resize

from invokeai.app.invocations.baseinvocation import BaseInvocation, BaseInvocationOutput, invocation, invocation_output
from invokeai.app.invocations.constants import LATENT_SCALE_FACTOR
from invokeai.app.invocations.fields import (
    DenoiseMaskField,
    FieldDescriptions,
    ImageField,
    Input,
    InputField,
    OutputField,
)
from invokeai.app.invocations.image_to_latents import ImageToLatentsInvocation
from invokeai.app.invocations.model import UNetField, VAEField
from invokeai.app.services.shared.invocation_context import InvocationContext
from invokeai.backend.model_manager import LoadedModel
from invokeai.backend.model_manager.config import MainConfigBase
from invokeai.backend.model_manager.taxonomy import ModelVariantType
from invokeai.backend.stable_diffusion.diffusers_pipeline import image_resized_to_grid_as_tensor


@invocation_output("gradient_mask_output")
class GradientMaskOutput(BaseInvocationOutput):
    """Outputs a denoise mask and an image representing the total gradient of the mask."""

    denoise_mask: DenoiseMaskField = OutputField(
        description="Mask for denoise model run. Values of 0.0 represent the regions to be fully denoised, and 1.0 "
        + "represent the regions to be preserved."
    )
    expanded_mask_area: ImageField = OutputField(
        description="Image representing the total gradient area of the mask. For paste-back purposes."
    )


@invocation(
    "create_gradient_mask",
    title="Create Gradient Mask",
    tags=["mask", "denoise"],
    category="latents",
    version="1.3.0",
)
class CreateGradientMaskInvocation(BaseInvocation):
    """Creates mask for denoising."""

    mask: ImageField = InputField(description="Image which will be masked", ui_order=1)
    edge_radius: int = InputField(default=16, ge=0, description="How far to expand the edges of the mask", ui_order=2)
    coherence_mode: Literal["Gaussian Blur", "Box Blur", "Staged"] = InputField(default="Gaussian Blur", ui_order=3)
    minimum_denoise: float = InputField(
        default=0.0, ge=0, le=1, description="Minimum denoise level for the coherence region", ui_order=4
    )
    image: Optional[ImageField] = InputField(
        default=None,
        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
        title="[OPTIONAL] Image",
        ui_order=6,
    )
    unet: Optional[UNetField] = InputField(
        description="OPTIONAL: If the Unet is a specialized Inpainting model, masked_latents will be generated from the image with the VAE",
        default=None,
        input=Input.Connection,
        title="[OPTIONAL] UNet",
        ui_order=5,
    )
    vae: Optional[VAEField] = InputField(
        default=None,
        description="OPTIONAL: Only connect for specialized Inpainting models, masked_latents will be generated from the image with the VAE",
        title="[OPTIONAL] VAE",
        input=Input.Connection,
        ui_order=7,
    )
    tiled: bool = InputField(default=False, description=FieldDescriptions.tiled, ui_order=8)
    fp32: bool = InputField(default=False, description=FieldDescriptions.fp32, ui_order=9)

    @torch.no_grad()
    def invoke(self, context: InvocationContext) -> GradientMaskOutput:
        mask_image = context.images.get_pil(self.mask.image_name, mode="L")

        # Resize the mask_image. Makes the filter 64x faster and doesn't hurt quality in latent scale anyway
        mask_image = mask_image.resize(
            (
                mask_image.width // LATENT_SCALE_FACTOR,
                mask_image.height // LATENT_SCALE_FACTOR,
            ),
            resample=Image.Resampling.BILINEAR,
        )

        mask_np_orig = np.array(mask_image, dtype=np.float32)

        self.edge_radius = self.edge_radius // LATENT_SCALE_FACTOR  # scale the edge radius to match the mask size

        if self.edge_radius > 0:
            mask_np = 255 - mask_np_orig  # invert so 0 is unmasked (higher values = higher denoise strength)
            dilated_mask = mask_np.copy()

            # Create kernel based on coherence mode
            if self.coherence_mode == "Box Blur":
                # Create a circular distance kernel that fades from center outward
                kernel_size = self.edge_radius * 2 + 1
                center = self.edge_radius
                kernel = np.zeros((kernel_size, kernel_size), dtype=np.float32)
                for i in range(kernel_size):
                    for j in range(kernel_size):
                        dist = np.sqrt((i - center) ** 2 + (j - center) ** 2)
                        if dist <= self.edge_radius:
                            kernel[i, j] = 1.0 - (dist / self.edge_radius)
            else:  # Gaussian Blur or Staged
                # Create a Gaussian kernel
                kernel_size = self.edge_radius * 2 + 1
                kernel = cv2.getGaussianKernel(
                    kernel_size, self.edge_radius / 2.5
                )  # 2.5 is a magic number (standard deviation capturing)
                kernel = kernel * kernel.T  # Make 2D gaussian kernel
                kernel = kernel / np.max(kernel)  # Normalize center to 1.0

                # Ensure values outside radius are 0
                center = self.edge_radius
                for i in range(kernel_size):
                    for j in range(kernel_size):
                        dist = np.sqrt((i - center) ** 2 + (j - center) ** 2)
                        if dist > self.edge_radius:
                            kernel[i, j] = 0

            # 2D max filter
            mask_tensor = torch.tensor(mask_np)
            kernel_tensor = torch.tensor(kernel)
            dilated_mask = 255 - self.max_filter2D_torch(mask_tensor, kernel_tensor).cpu()
            dilated_mask = dilated_mask.numpy()

            threshold = (1 - self.minimum_denoise) * 255

            if self.coherence_mode == "Staged":
                # wherever expanded mask is darker than the original mask but original was above threshhold, set it to the threshold
                # makes any expansion areas drop to threshhold. Raising minimum across the image happen outside of this if
                threshold_mask = (dilated_mask < mask_np_orig) & (mask_np_orig > threshold)
                dilated_mask = np.where(threshold_mask, threshold, mask_np_orig)

            # wherever expanded mask is less than 255 but greater than threshold, drop it to threshold (minimum denoise)
            threshold_mask = (dilated_mask > threshold) & (dilated_mask < 255)
            dilated_mask = np.where(threshold_mask, threshold, dilated_mask)

        else:
            dilated_mask = mask_np_orig.copy()

        # convert to tensor
        dilated_mask = np.clip(dilated_mask, 0, 255).astype(np.uint8)
        mask_tensor = torch.tensor(dilated_mask, device=torch.device("cpu"))

        # binary mask for compositing
        expanded_mask = np.where((dilated_mask < 255), 0, 255)
        expanded_mask_image = Image.fromarray(expanded_mask.astype(np.uint8), mode="L")
        expanded_mask_image = expanded_mask_image.resize(
            (
                mask_image.width * LATENT_SCALE_FACTOR,
                mask_image.height * LATENT_SCALE_FACTOR,
            ),
            resample=Image.Resampling.NEAREST,
        )
        expanded_image_dto = context.images.save(expanded_mask_image)

        # restore the original mask size
        dilated_mask = Image.fromarray(dilated_mask.astype(np.uint8))
        dilated_mask = dilated_mask.resize(
            (
                mask_image.width * LATENT_SCALE_FACTOR,
                mask_image.height * LATENT_SCALE_FACTOR,
            ),
            resample=Image.Resampling.NEAREST,
        )

        # stack the mask as a tensor, repeating 4 times on dimmension 1
        dilated_mask_tensor = image_resized_to_grid_as_tensor(dilated_mask, normalize=False)
        mask_name = context.tensors.save(tensor=dilated_mask_tensor.unsqueeze(0))

        masked_latents_name = None
        if self.unet is not None and self.vae is not None and self.image is not None:
            # all three fields must be present at the same time
            main_model_config = context.models.get_config(self.unet.unet.key)
            assert isinstance(main_model_config, MainConfigBase)
            if main_model_config.variant is ModelVariantType.Inpaint:
                mask = dilated_mask_tensor
                vae_info: LoadedModel = context.models.load(self.vae.vae)
                image = context.images.get_pil(self.image.image_name)
                image_tensor = image_resized_to_grid_as_tensor(image.convert("RGB"))
                if image_tensor.dim() == 3:
                    image_tensor = image_tensor.unsqueeze(0)
                img_mask = tv_resize(mask, image_tensor.shape[-2:], T.InterpolationMode.BILINEAR, antialias=False)
                masked_image = image_tensor * torch.where(img_mask < 0.5, 0.0, 1.0)
                context.util.signal_progress("Running VAE encoder")
                masked_latents = ImageToLatentsInvocation.vae_encode(
                    vae_info, self.fp32, self.tiled, masked_image.clone()
                )
                masked_latents_name = context.tensors.save(tensor=masked_latents)

        return GradientMaskOutput(
            denoise_mask=DenoiseMaskField(mask_name=mask_name, masked_latents_name=masked_latents_name, gradient=True),
            expanded_mask_area=ImageField(image_name=expanded_image_dto.image_name),
        )

    def max_filter2D_torch(self, image: torch.Tensor, kernel: torch.Tensor) -> torch.Tensor:
        """
        This morphological operation is much faster in torch than numpy or opencv
        For reasonable kernel sizes, the overhead of copying the data to the GPU is not worth it.
        """
        h, w = kernel.shape
        pad_h, pad_w = h // 2, w // 2

        padded = torch.nn.functional.pad(image, (pad_w, pad_w, pad_h, pad_h), mode="constant", value=0)
        result = torch.zeros_like(image)

        # This looks like it's inside out, but it does the same thing and is more efficient
        for i in range(h):
            for j in range(w):
                weight = kernel[i, j]
                if weight <= 0:
                    continue

                # Extract the region from padded tensor
                region = padded[i : i + image.shape[0], j : j + image.shape[1]]

                # Apply weight and update max
                result = torch.maximum(result, region * weight)

        return result