Add commented-out PyTorch profiler code.

Avoid unnecessary dtype conversions with rope encodings.
Use view() instead of rearrange() for better performance.
2026-01-16 09:28:13 -05:00 · 2024-11-24 04:58:01 +00:00 · 2024-11-24 04:48:44 +00:00 · 2024-11-24 04:31:02 +00:00 · 2024-11-24 04:17:52 +00:00
3 changed files with 9 additions and 12 deletions
--- a/invokeai/app/invocations/flux_denoise.py
+++ b/invokeai/app/invocations/flux_denoise.py
@@ -334,6 +334,8 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
                dtype=inference_dtype,
            )

+            # activities = [torch.profiler.ProfilerActivity.CPU, torch.profiler.ProfilerActivity.CUDA]
+            # with torch.profiler.profile(activities=activities, record_shapes=True, with_stack=True) as prof:
            x = denoise(
                model=transformer,
                img=x,
@@ -353,6 +355,7 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
                pos_ip_adapter_extensions=pos_ip_adapter_extensions,
                neg_ip_adapter_extensions=neg_ip_adapter_extensions,
            )
+            # prof.export_chrome_trace("trace.json")

        x = unpack(x.float(), self.height, self.width)
        return x
--- a/invokeai/backend/flux/math.py
+++ b/invokeai/backend/flux/math.py
@@ -16,20 +16,17 @@ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:

 def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
    assert dim % 2 == 0
-    scale = (
-        torch.arange(0, dim, 2, dtype=torch.float32 if pos.device.type == "mps" else torch.float64, device=pos.device)
-        / dim
-    )
+    scale = torch.arange(0, dim, 2, dtype=pos.dtype, device=pos.device) / dim
    omega = 1.0 / (theta**scale)
    out = torch.einsum("...n,d->...nd", pos, omega)
    out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
    out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
-    return out.float()
+    return out


 def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
-    xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
-    xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
+    xq_ = xq.view(*xq.shape[:-1], -1, 1, 2)
+    xk_ = xk.view(*xk.shape[:-1], -1, 1, 2)
    xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
    xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
-    return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
+    return xq_out.view(*xq.shape), xk_out.view(*xk.shape)
--- a/invokeai/backend/flux/modules/layers.py
+++ b/invokeai/backend/flux/modules/layers.py
@@ -66,10 +66,7 @@ class RMSNorm(torch.nn.Module):
        self.scale = nn.Parameter(torch.ones(dim))

    def forward(self, x: Tensor):
-        x_dtype = x.dtype
-        x = x.float()
-        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
-        return (x * rrms).to(dtype=x_dtype) * self.scale
+        return torch.nn.functional.rms_norm(x, self.scale.shape, self.scale, eps=1e-6)


 class QKNorm(torch.nn.Module):
Author	SHA1	Message	Date
Ryan Dick	182c5793ba	Commented pytorch profiler.	2024-11-24 04:58:01 +00:00
Ryan Dick	675a66612c	Avoid unnecessary dtype conversions with rope encodings.	2024-11-24 04:48:44 +00:00
Ryan Dick	abdf2a7f86	Use view() instead of rearrange() for better performance.	2024-11-24 04:31:02 +00:00
Ryan Dick	bb098ec064	Replace custom RMSNorm implementation with torch.nn.functional.rms_norm(...) for improved speed.	2024-11-24 04:17:52 +00:00