mirror of
https://github.com/invoke-ai/InvokeAI.git
synced 2026-01-14 12:57:57 -05:00
434 lines
17 KiB
Python
434 lines
17 KiB
Python
from typing import Any, Literal, Union
|
||
|
||
import cv2
|
||
import numpy as np
|
||
import torch
|
||
from einops import rearrange
|
||
from PIL import Image
|
||
|
||
from invokeai.backend.image_util.util import nms, normalize_image_channel_count
|
||
|
||
# Resize strategies understood by `np_img_resize` / `prepare_control_image`.
CONTROLNET_RESIZE_VALUES = Literal["just_resize", "crop_resize", "fill_resize", "just_resize_simple"]
# Values accepted for the `control_mode` argument of `prepare_control_image`.
CONTROLNET_MODE_VALUES = Literal["balanced", "more_prompt", "more_control", "unbalanced"]
|
||
|
||
###################################################################
|
||
# Copy of scripts/lvminthin.py from Mikubill/sd-webui-controlnet
|
||
###################################################################
|
||
# High Quality Edge Thinning using Pure Python
|
||
# Written by Lvmin Zhang
|
||
# 2023 April
|
||
# Stanford University
|
||
# If you use this, please Cite "High Quality Edge Thinning using Pure Python", Lvmin Zhang, In Mikubill/sd-webui-controlnet.
|
||
|
||
# Base 3x3 templates for the thinning pass. They are handed to cv2.MORPH_HITMISS
# by `remove_pattern` (OpenCV convention: 1 = must be set, -1 = must be clear,
# 0 = don't care).
lvmin_kernels_raw = [
    np.array([[-1, -1, -1], [0, 1, 0], [1, 1, 1]], dtype=np.int32),
    np.array([[0, -1, -1], [1, 1, -1], [0, 1, 0]], dtype=np.int32),
]

# Each thinning template in all four 90-degree rotations, ordered by rotation
# count first and template second.
lvmin_kernels = [
    np.rot90(template, k=turns, axes=(0, 1)) for turns in range(4) for template in lvmin_kernels_raw
]

# Base 3x3 templates for the optional pruning pass (same hit-or-miss encoding).
lvmin_prunings_raw = [
    np.array([[-1, -1, -1], [-1, 1, -1], [0, 0, -1]], dtype=np.int32),
    np.array([[-1, -1, -1], [-1, 1, -1], [-1, 0, 0]], dtype=np.int32),
]

# Each pruning template in all four 90-degree rotations, same ordering as above.
lvmin_prunings = [
    np.rot90(template, k=turns, axes=(0, 1)) for turns in range(4) for template in lvmin_prunings_raw
]
|
||
|
||
|
||
def remove_pattern(x, kernel):
    """Clear every pixel of ``x`` where the hit-or-miss ``kernel`` matches.

    ``x`` is modified in place and also returned.

    Args:
        x: Image array passed to ``cv2.morphologyEx``; presumably a single-channel
            uint8 edge map with values 0/255 (the ``> 127`` test suggests so) - TODO confirm.
        kernel: Structuring element for ``cv2.MORPH_HITMISS``.

    Returns:
        A ``(x, changed)`` tuple, where ``changed`` is True when at least one pixel matched.
    """
    matches = cv2.morphologyEx(x, cv2.MORPH_HITMISS, kernel)
    hit_index = np.nonzero(matches > 127)
    # Matched positions are erased directly in the caller's array.
    x[hit_index] = 0
    any_hit = hit_index[0].size > 0
    return x, any_hit
|
||
|
||
|
||
def thin_one_time(x, kernels):
    """Run one sweep of ``remove_pattern`` over ``x`` for every kernel, in order.

    Args:
        x: Image array to thin; forwarded to ``remove_pattern``, which edits it in place.
        kernels: Iterable of hit-or-miss kernels to apply.

    Returns:
        A ``(image, is_done)`` tuple. ``is_done`` is True only when no kernel removed
        any pixel during this sweep.
    """
    current = x
    changed = False
    for kernel in kernels:
        current, hit = remove_pattern(current, kernel)
        changed = changed or hit
    return current, not changed
|
||
|
||
|
||
def lvmin_thin(x, prunings=True):
    """Thin the edges in ``x`` with the ``lvmin_kernels`` hit-or-miss templates.

    Sweeps are repeated until one of them removes nothing, capped at 32 sweeps.
    The array is edited in place by the underlying helpers.

    Args:
        x: Edge-map array to thin.
        prunings: When True, finish with a single sweep of ``lvmin_prunings``.

    Returns:
        The thinned array.
    """
    thinned = x
    for _ in range(32):
        thinned, converged = thin_one_time(thinned, lvmin_kernels)
        if converged:
            break
    if not prunings:
        return thinned
    pruned, _ = thin_one_time(thinned, lvmin_prunings)
    return pruned
|
||
|
||
|
||
################################################################################
|
||
# copied from Mikubill/sd-webui-controlnet external_code.py and modified for InvokeAI
|
||
################################################################################
|
||
# FIXME: not using yet, if used in the future will most likely require modification of preprocessors
|
||
def pixel_perfect_resolution(
    image: np.ndarray,
    target_H: int,
    target_W: int,
    resize_mode: str,
) -> int:
    """Estimate the preprocessing resolution that maps ``image`` onto the target size.

    Two scale factors are computed, ``target_H / raw_H`` and ``target_W / raw_W``.
    One of them is multiplied by the shorter side of the image:

    - ``"fill_resize"`` uses the smaller factor, so the whole image fits inside the
      target and some of the target may stay empty.
    - Any other mode (``"crop_resize"``, ``"just_resize"``, ...) uses the larger
      factor, so the target is fully covered and the image may be cropped.

    Args:
        image: Image array whose first two dimensions are ``[height, width]``. Any
            trailing dimensions (e.g. channels) are ignored, so 2-D grayscale arrays
            are accepted as well.
        target_H: The target height for the image.
        target_W: The target width for the image.
        resize_mode: The resize mode name, e.g. ``"fill_resize"`` or ``"crop_resize"``.

    Returns:
        The estimated resolution, rounded to the nearest integer.

    Raises:
        ZeroDivisionError: If the image height or width is zero.
    """
    raw_H, raw_W = image.shape[:2]

    scale_h = float(target_H) / float(raw_H)
    scale_w = float(target_W) / float(raw_W)

    # "fill_resize" must fit entirely inside the target; every other mode covers it.
    pick_scale = min if resize_mode == "fill_resize" else max
    estimation = pick_scale(scale_h, scale_w) * float(min(raw_H, raw_W))

    return int(np.round(estimation))
|
||
|
||
|
||
def clone_contiguous(x: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """Return an independent, C-contiguous copy of ``x``.

    The copy never shares memory with the input, so later in-place edits to either
    array cannot affect the other.
    """
    contiguous = np.ascontiguousarray(x)
    # ascontiguousarray may hand back the input itself, so copy explicitly.
    return contiguous.copy()
|
||
|
||
|
||
def np_img_to_torch(np_img: np.ndarray[Any, Any], device: torch.device) -> torch.Tensor:
    """Turn an HWC numpy image into a float BCHW tensor on ``device``.

    Values are divided by 255 (this assumes a 0-255 input range), a leading batch
    dimension of size 1 is added, and the result is moved to ``device`` and cloned.
    """
    scaled = torch.from_numpy(np_img).float() / 255.0
    batched = rearrange(scaled, "h w c -> 1 c h w")
    return batched.to(device).clone()
|
||
|
||
|
||
def heuristic_resize(np_img: np.ndarray[Any, Any], size: tuple[int, int]) -> np.ndarray[Any, Any]:
    """Resize an image, picking the strategy from what the image appears to contain.

    - Two-colour images with near-black and near-white values are treated as binary
      edge maps and re-binarised after resizing so the edges are not smeared.
    - Edge maps that look one pixel wide additionally go through NMS and thinning.
    - Images with few distinct colours (e.g. segmentation maps) are resized with
      nearest-neighbour so no new colours are invented.
    - A fourth (alpha) channel is split off, resized on its own and re-attached as
      a hard 0/255 mask.

    Args:
        np_img (np.ndarray): The input image, indexed as [height, width, channels].
        size (tuple[int, int]): The target size as (width, height).

    Returns:
        np.ndarray: The resized image. If the input already has the requested size,
        the input object itself is returned.

    Adapted from https://github.com/Mikubill/sd-webui-controlnet.
    """
    target_w, target_h = size[0], size[1]
    src_h, src_w = np_img.shape[0], np_img.shape[1]

    # Nothing to do when the image already matches the request.
    if src_h == target_h and src_w == target_w:
        return np_img

    # Peel off the alpha channel; it gets its own treatment at the end.
    alpha = None
    if np_img.ndim == 3 and np_img.shape[2] == 4:
        alpha = np_img[:, :, 3]
        np_img = np_img[:, :, 0:3]

    target_area = target_w * target_h
    source_area = src_h * src_w
    downscaling = target_area < source_area
    upscaling = target_area > source_area

    color_count = np.unique(np_img.reshape(-1, np_img.shape[2]), axis=0).shape[0]

    is_binary = False
    thin_edges = False
    if color_count == 2:
        # Exactly two colours: binary only if they sit near black and near white.
        is_binary = np.min(np_img) < 16 and np.max(np_img) > 240
        if is_binary:
            # A morphological opening wipes out one-pixel-wide strokes; if it removes
            # more than half of the lit pixels, treat the image as a thin edge map.
            square = np.ones(shape=(3, 3), dtype=np.uint8)
            opened = cv2.dilate(cv2.erode(np_img, square, iterations=1), square, iterations=1)
            removed_count = np.count_nonzero(opened < np_img)
            lit_count = np.count_nonzero(np_img > 127)
            thin_edges = removed_count * 2 > lit_count

    if 2 < color_count < 200:
        # Few colours: exact colour values matter, so never blend neighbours.
        interpolation = cv2.INTER_NEAREST
    elif downscaling:
        # Area averaging works best when shrinking.
        interpolation = cv2.INTER_AREA
    else:
        # The NMS step below relies on cubic interpolation - do not change this.
        interpolation = cv2.INTER_CUBIC

    resized = cv2.resize(np_img, size, interpolation=interpolation)

    if alpha is not None:
        # The mask follows the colour channels' interpolation method.
        alpha = cv2.resize(alpha, size, interpolation=interpolation)

    if is_binary:
        # Collapse to one channel, re-binarise with Otsu, then expand back to three.
        gray = np.mean(resized.astype(np.float32), axis=2).clip(0, 255).astype(np.uint8)
        if thin_edges:
            gray = nms(gray)
        _, gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        if thin_edges:
            # Pruning is only requested when the image was enlarged.
            gray = lvmin_thin(gray, prunings=upscaling)
        resized = np.stack([gray] * 3, axis=2)

    if alpha is not None:
        # Harden the mask to 0/255 and put it back as the fourth channel.
        hard_mask = (alpha > 127).astype(np.float32) * 255.0
        hard_mask = hard_mask[:, :, None].clip(0, 255).astype(np.uint8)
        resized = np.concatenate([resized, hard_mask], axis=2)

    return resized
|
||
|
||
|
||
# 3x3 rectangular structuring element, built once at import time.
_KERNEL3 = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))

# Line-shaped 3x3 masks (horizontal, vertical and the two diagonals) used as
# dilation kernels by the NMS step of `heuristic_resize_fast`.
_DIRS = [
    np.array(rows, np.uint8)
    for rows in (
        [[0, 0, 0], [1, 1, 1], [0, 0, 0]],
        [[0, 1, 0], [0, 1, 0], [0, 1, 0]],
        [[1, 0, 0], [0, 1, 0], [0, 0, 1]],
        [[0, 0, 1], [0, 1, 0], [1, 0, 0]],
    )
]
|
||
|
||
|
||
def heuristic_resize_fast(np_img: np.ndarray, size: tuple[int, int]) -> np.ndarray:
    """Resize an image with a content-dependent strategy, using sampled statistics.

    Colour statistics (distinct colour count, min, max) are taken from at most
    100,000 pixels. Images at or below that size are analysed exactly; larger ones
    are sampled with a locally seeded generator, so the result is reproducible and
    the global ``np.random`` state is left untouched.

    - Two-colour images spanning near-black to near-white are treated as binary
      maps: after resizing they go through directional non-maximum suppression and
      Otsu thresholding, plus ``cv2.ximgproc.thinning`` when the edges look one
      pixel wide.
    - Images with 3-199 distinct sampled colours use nearest-neighbour interpolation.
    - Otherwise area interpolation is used when shrinking and cubic when not.
    - A fourth (alpha) channel is resized separately and restored as a 0/255 mask.

    Args:
        np_img: Input image indexed as [height, width, channels]; presumably uint8
            with 3 or 4 channels, as produced by the caller - TODO confirm.
        size: Target size as (width, height).

    Returns:
        The resized image, or ``np_img`` itself when it already has the target size.
    """
    h, w = np_img.shape[:2]
    # Early exit; tuple() lets list-like sizes match as well.
    if (w, h) == tuple(size):
        return np_img

    # Separate the alpha channel so it does not influence colour statistics.
    img = np_img
    alpha = None
    if img.ndim == 3 and img.shape[2] == 4:
        alpha, img = img[:, :, 3], img[:, :, :3]

    # Gather the pixels used for unique-colour and binary detection.
    flat = img.reshape(-1, img.shape[-1])
    pixel_count = flat.shape[0]
    max_samples = 100_000
    if pixel_count > max_samples:
        # A private, fixed-seed generator keeps detection deterministic and avoids
        # consuming the process-wide numpy random stream.
        rng = np.random.default_rng(0)
        picked = flat[rng.choice(pixel_count, max_samples, replace=False)]
        # Always include the four corners to avoid missing extreme values.
        corners = np.vstack([img[0, 0], img[0, w - 1], img[h - 1, 0], img[h - 1, w - 1]])
        sample = np.vstack([corners, picked])
    else:
        # Small image: use every pixel, no sampling needed.
        sample = flat

    uc = np.unique(sample, axis=0).shape[0]
    vmin, vmax = sample.min(), sample.max()

    # Detect a binary edge map and whether its edges are about one pixel wide.
    is_binary = uc == 2 and vmin < 16 and vmax > 240
    one_pixel_edge = False
    if is_binary:
        gray0 = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        grad = cv2.morphologyEx(gray0, cv2.MORPH_GRADIENT, _KERNEL3)
        cnt_edge = cv2.countNonZero(grad)
        cnt_all = cv2.countNonZero((gray0 > 127).astype(np.uint8))
        one_pixel_edge = (2 * cnt_edge) > cnt_all

    # Choose the interpolation for the colour channels.
    area_new, area_old = size[0] * size[1], w * h
    if 2 < uc < 200:
        # Few colours (e.g. a segmentation map): never blend neighbouring values.
        interp = cv2.INTER_NEAREST
    elif area_new < area_old:
        interp = cv2.INTER_AREA
    else:
        interp = cv2.INTER_CUBIC

    # Single resize pass on the colour channels.
    resized = cv2.resize(img, size, interpolation=interp)

    if is_binary:
        # Directional NMS: keep a pixel when it equals the maximum along at least
        # one of the four line-shaped neighbourhoods.
        gray_r = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
        suppressed = np.zeros_like(gray_r)
        for direction in _DIRS:
            dilated = cv2.dilate(gray_r, direction)
            keep = dilated == gray_r
            suppressed[keep] = gray_r[keep]

        # Re-binarise, thin when the source edges were one pixel wide, and go back
        # to three identical channels.
        _, bw = cv2.threshold(suppressed, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        out_bin = cv2.ximgproc.thinning(bw) if one_pixel_edge else bw
        resized = np.stack([out_bin] * 3, axis=2)

    # Restore alpha, resized with the same interpolation as the colour channels
    # and hardened to a 0/255 mask.
    if alpha is not None:
        am = cv2.resize(alpha, size, interpolation=interp)
        am = (am > 127).astype(np.uint8) * 255
        resized = np.dstack((resized, am))

    return resized
|
||
|
||
|
||
###########################################################################
|
||
# Copied from detectmap_proc method in scripts/detectmap_proc.py in Mikubill/sd-webui-controlnet
|
||
# modified for InvokeAI
|
||
###########################################################################
|
||
def np_img_resize(
    np_img: np.ndarray,
    resize_mode: CONTROLNET_RESIZE_VALUES,
    h: int,
    w: int,
    device: torch.device = torch.device("cpu"),
) -> tuple[torch.Tensor, np.ndarray[Any, Any]]:
    """Resize a numpy image to ``h`` x ``w`` and return it as a tensor and an array.

    The image is first passed through ``normalize_image_channel_count``. Then:

    - ``"just_resize"``: stretch directly to (w, h), ignoring aspect ratio.
    - ``"fill_resize"``: scale to fit inside the target and centre the result on a
      canvas filled with the median colour of the image border (alpha forced to
      255 when the image has four channels).
    - any other value (``"crop_resize"``): scale to cover the target and
      centre-crop to (h, w).

    Args:
        np_img: Input image; indexed as [height, width, channels] after normalisation.
        resize_mode: One of the resize strategies above.
        h: Target height in pixels.
        w: Target width in pixels.
        device: Device for the returned tensor.

    Returns:
        A ``(tensor, array)`` tuple: the output of ``np_img_to_torch`` for the
        resized image, and the resized image itself as a contiguous numpy array.
    """
    np_img = normalize_image_channel_count(np_img)

    if resize_mode == "just_resize":  # RESIZE
        result = heuristic_resize_fast(np_img, (w, h))
    else:
        src_h, src_w, _ = np_img.shape
        src_h, src_w = float(src_h), float(src_w)
        scale_h = float(h) / src_h
        scale_w = float(w) / src_w

        def scaled_size(scale: float) -> tuple[int, int]:
            """Return the rounded (width, height) of the source at the given scale."""
            return int(np.round(src_w * scale)), int(np.round(src_h * scale))

        if resize_mode == "fill_resize":  # OUTER_FIT
            scale = min(scale_h, scale_w)
            # Background colour: per-channel median over the four border lines.
            border_pixels = np.concatenate(
                [np_img[0, :, :], np_img[-1, :, :], np_img[:, 0, :], np_img[:, -1, :]], axis=0
            )
            fill_color = np.median(border_pixels, axis=0).astype(np_img.dtype)
            if len(fill_color) == 4:
                # Inpaint hijack: the padding's fourth channel is always 255.
                fill_color[3] = 255
            canvas = np.tile(fill_color[None, None], [h, w, 1])
            scaled = heuristic_resize_fast(np_img, scaled_size(scale))
            new_h, new_w, _ = scaled.shape
            top = max(0, (h - new_h) // 2)
            left = max(0, (w - new_w) // 2)
            canvas[top : top + new_h, left : left + new_w] = scaled
            result = canvas
        else:  # resize_mode == "crop_resize" (INNER_FIT)
            scale = max(scale_h, scale_w)
            scaled = heuristic_resize_fast(np_img, scaled_size(scale))
            new_h, new_w, _ = scaled.shape
            top = max(0, (new_h - h) // 2)
            left = max(0, (new_w - w) // 2)
            result = scaled[top : top + h, left : left + w]

    result = clone_contiguous(result)
    return np_img_to_torch(result, device), result
|
||
|
||
|
||
def prepare_control_image(
    image: Image.Image,
    width: int,
    height: int,
    num_channels: int = 3,
    device: str | torch.device = "cuda",
    dtype: torch.dtype = torch.float16,
    control_mode: CONTROLNET_MODE_VALUES = "balanced",
    resize_mode: CONTROLNET_RESIZE_VALUES = "just_resize_simple",
    do_classifier_free_guidance: bool = True,
) -> torch.Tensor:
    """Pre-process images for ControlNets or T2I-Adapters.

    Args:
        image (Image): The PIL image to pre-process.
        width (int): The target width in pixels.
        height (int): The target height in pixels.
        num_channels (int, optional): The target number of image channels. This is achieved by naively taking the
            first `num_channels` channels of the resized image. The primary use case is converting a RGB image to a
            single-channel grayscale image. Raises if `num_channels` cannot be achieved. Defaults to 3.
        device (str | torch.Device, optional): The target device for the output image. Defaults to "cuda".
        dtype (torch.dtype, optional): The dtype for the output image. Defaults to torch.float16.
        control_mode (str, optional): Defaults to "balanced". With "more_control" or "unbalanced" the output is
            never repeated along the batch dimension.
        resize_mode (str, optional): Defaults to "just_resize_simple", which converts the image to RGB and resizes it
            with LANCZOS resampling. The other modes delegate to `np_img_resize`.
        do_classifier_free_guidance (bool, optional): If True, repeat the output image along the batch dimension
            (unless `control_mode` disables it). Defaults to True.

    Raises:
        ValueError: If `resize_mode` is not recognized.
        ValueError: If `num_channels` is out of range.

    Returns:
        torch.Tensor: The pre-processed input tensor.
    """
    if resize_mode == "just_resize_simple":
        rgb = image.convert("RGB").resize((width, height), resample=Image.LANCZOS)
        # Add a batch axis, then scale the 0-255 PIL values into [0, 1].
        pixels = np.array(rgb)[None, :]
        pixels = pixels.astype(np.float32) / 255.0
        # NHWC -> NCHW
        timage = torch.from_numpy(pixels.transpose(0, 3, 1, 2))
    elif resize_mode in ("just_resize", "crop_resize", "fill_resize"):
        # use fancy lvmin controlnet resizing
        timage, _ = np_img_resize(
            np_img=np.array(image),
            resize_mode=resize_mode,
            h=height,
            w=width,
            device=torch.device(device),
        )
    else:
        raise ValueError(f"Unsupported resize_mode: '{resize_mode}'.")

    available_channels = timage.shape[1]
    if available_channels < num_channels or num_channels <= 0:
        raise ValueError(f"Cannot achieve the target of num_channels={num_channels}.")
    timage = timage[:, :num_channels, :, :]

    timage = timage.to(device=device, dtype=dtype)

    # These control modes inject the control image themselves, so it is not doubled.
    cfg_injection = control_mode in ("more_control", "unbalanced")
    if do_classifier_free_guidance and not cfg_injection:
        timage = torch.cat([timage, timage])
    return timage
|