From e878ca25937953089448053470d7aed7a8abd614 Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Tue, 26 May 2026 10:07:58 +1000
Subject: [PATCH 01/11] memory_management: Add direct read to GPU mode

Make destination optional (or make it optionally GPU) and use aimdo
to file_read direct to GPU.
---
 comfy/memory_management.py | 37 ++++++++++++++++++++++++-------------
 requirements.txt           |  2 +-
 2 files changed, 25 insertions(+), 14 deletions(-)

diff --git a/comfy/memory_management.py b/comfy/memory_management.py
index 962addb27bf9..98985b8398c2 100644
--- a/comfy/memory_management.py
+++ b/comfy/memory_management.py
@@ -4,6 +4,7 @@
 import torch
 from typing import NamedTuple
 
+import comfy_aimdo.host_buffer
 from comfy.quant_ops import QuantizedTensor
 
 
@@ -17,21 +18,18 @@ class TensorFileSlice(NamedTuple):
 def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None):
 
     if isinstance(tensor, QuantizedTensor):
-        if not isinstance(destination, QuantizedTensor):
-            return False
-        if tensor._layout_cls != destination._layout_cls:
-            return False
-
-        if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream,
+        if not read_tensor_file_slice_into(tensor._qdata,
+                                           destination._qdata if destination is not None else None, stream=stream,
                                            destination2=(destination2._qdata if destination2 is not None else None)):
             return False
 
-        dst_orig_dtype = destination._params.orig_dtype
-        destination._params.copy_from(tensor._params, non_blocking=False)
-        destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype)
+        if destination is not None:
+            dst_orig_dtype = destination._params.orig_dtype
+            destination._params.copy_from(tensor._params, non_blocking=False)
+            destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype)
         if destination2 is not None:
             dst_orig_dtype = destination2._params.orig_dtype
-            destination2._params.copy_from(destination._params, non_blocking=True)
+            destination2._params.copy_from(destination._params if destination is not None else tensor._params, non_blocking=True)
             destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype)
         return True
 
@@ -39,10 +37,15 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N
     if info is None:
         return False
 
+    if destination is not None and destination.device.type != "cpu" and destination2 is None:
+        destination2 = destination
+        destination = None
+
     file_obj = info.file_ref
-    if (destination.device.type != "cpu"
-            or file_obj is None
-            or destination.numel() * destination.element_size() < info.size
+    if (file_obj is None
+            or (destination is None and destination2 is None)
+            or (destination is not None and (destination.device.type != "cpu" or destination.numel() * destination.element_size() < info.size))
+            or (destination2 is not None and (destination2.device.type == "cpu" or destination2.numel() * destination2.element_size() < info.size))
             or tensor.numel() * tensor.element_size() != info.size
             or tensor.storage_offset() != 0
             or not tensor.is_contiguous()):
@@ -51,6 +54,14 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N
     if info.size == 0:
         return True
 
+    if destination is None:
+        stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0
+        comfy_aimdo.host_buffer.read_file_to_device(file_obj, info.offset, info.size,
+                                                    stream_ptr, destination2.data_ptr(),
+                                                    destination2.device.index,
+                                                    mark_cold=False)
+        return True
+
     hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None)
     if hostbuf is not None:
         stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0
diff --git a/requirements.txt b/requirements.txt
index 0617667e1ba5..0647cd23e252 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0
 filelock
 av>=16.0.0
 comfy-kitchen==0.2.9
-comfy-aimdo==0.4.5
+comfy-aimdo==0.4.6
 requests
 simpleeval>=1.0.0
 blake3

From b1200a6170c00a80c5d699b6d1fdbe92270ebe84 Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Tue, 26 May 2026 10:09:30 +1000
Subject: [PATCH 02/11] ops: Remove stream pin buffers and use aimdo reads

This consumed too much RAM, and it's better to just take the hit of
the CPU syncing back the stream on a short ring buffer. Aimdo
implements this, so just rip the stream pin buffer out of comfy.
---
 comfy/model_management.py | 44 +++-----------------------
 comfy/ops.py              | 66 +++++++--------------------------------
 2 files changed, 16 insertions(+), 94 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index b01c4d7fad21..f4fb0f33db84 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1283,7 +1283,6 @@ def current_stream(device):
 LARGEST_CASTED_WEIGHT = (None, 0)
 STREAM_AIMDO_CAST_BUFFERS = {}
 LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
-STREAM_PIN_BUFFERS = {}
 
 DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3
 
@@ -1326,42 +1325,13 @@ def get_aimdo_cast_buffer(offload_stream, device):
         STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer
     return cast_buffer
 
-def get_pin_buffer(offload_stream):
-    pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None)
-    if pin_buffer is None:
-        pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0, 0, pinned_hostbuf_size(8 * 1024**3), mark_cold=False)
-        STREAM_PIN_BUFFERS[offload_stream] = pin_buffer
-    elif offload_stream is not None:
-        event = getattr(pin_buffer, "_comfy_event", None)
-        if event is not None:
-            event.synchronize()
-            delattr(pin_buffer, "_comfy_event")
-    return pin_buffer
-
-def resize_pin_buffer(pin_buffer, size):
-    global TOTAL_PINNED_MEMORY
-    old_size = pin_buffer.size
-    if size <= old_size:
-        return True
-    growth = size - old_size
-    comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
-    ensure_pin_budget(growth, evict_active=True)
-    ensure_pin_registerable(growth, evict_active=True)
-    try:
-        pin_buffer.extend(size=size, reallocate=True)
-    except RuntimeError:
-        return False
-    TOTAL_PINNED_MEMORY += pin_buffer.size - old_size
-    return True
-
 def reset_cast_buffers():
-    global TOTAL_PINNED_MEMORY
     global LARGEST_CASTED_WEIGHT
     global LARGEST_AIMDO_CASTED_WEIGHT
 
     LARGEST_CASTED_WEIGHT = (None, 0)
     LARGEST_AIMDO_CASTED_WEIGHT = (None, 0)
-    for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS):
+    for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS):
         if offload_stream is not None:
             offload_stream.synchronize()
     synchronize()
@@ -1370,10 +1340,6 @@ def reset_cast_buffers():
         mmap_obj.bounce()
     DIRTY_MMAPS.clear()
 
-    for pin_buffer in STREAM_PIN_BUFFERS.values():
-        TOTAL_PINNED_MEMORY -= pin_buffer.size
-    TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY)
-
     for loaded_model in current_loaded_models:
         model = loaded_model.model
         if model is not None and model.is_dynamic():
@@ -1383,7 +1349,6 @@ def reset_cast_buffers():
 
     STREAM_CAST_BUFFERS.clear()
     STREAM_AIMDO_CAST_BUFFERS.clear()
-    STREAM_PIN_BUFFERS.clear()
     soft_empty_cache()
 
 def get_offload_stream(device):
@@ -1436,7 +1401,7 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None):
        if hasattr(wf_context, "as_context"):
            wf_context = wf_context.as_context(stream)
 
-    dest_views = comfy.memory_management.interpret_gathered_like(tensors, r)
+    dest_views = comfy.memory_management.interpret_gathered_like(tensors, r) if r is not None else [None] * len(tensors)
     dest2_views = comfy.memory_management.interpret_gathered_like(tensors, r2) if r2 is not None else None
     with wf_context:
         for tensor in tensors:
@@ -1448,9 +1413,10 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None):
                 continue
             storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage()
             mark_mmap_dirty(storage)
-            dest_view.copy_(tensor, non_blocking=non_blocking)
+            if dest_view is not None:
+                dest_view.copy_(tensor, non_blocking=non_blocking)
             if dest2_view is not None:
-                dest2_view.copy_(dest_view, non_blocking=non_blocking)
+                dest2_view.copy_(tensor if dest_view is None else dest_view, non_blocking=non_blocking)
 
 
 def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None):
diff --git a/comfy/ops.py b/comfy/ops.py
index 56445be8d4c2..119177c374f9 100644
--- a/comfy/ops.py
+++ b/comfy/ops.py
@@ -76,8 +76,6 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs):
 
 cast_to = comfy.model_management.cast_to #TODO: remove once no more references
 
-STREAM_PIN_BUFFER_HEADROOM = 8 * 1024 * 1024
-
 def cast_to_input(weight, input, non_blocking=False, copy=True):
     return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy)
 
@@ -94,9 +92,6 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin
     offload_stream = None
     cast_buffer = None
     cast_buffer_offset = 0
-    stream_pin_hostbuf = None
-    stream_pin_offset = 0
-    stream_pin_queue = []
 
     def ensure_offload_stream(module, required_size, check_largest):
         nonlocal offload_stream
@@ -130,22 +125,6 @@ def get_cast_buffer(buffer_size):
         cast_buffer_offset += buffer_size
         return buffer
 
-    def get_stream_pin_buffer_offset(buffer_size):
-        nonlocal stream_pin_hostbuf
-        nonlocal stream_pin_offset
-
-        if buffer_size == 0 or offload_stream is None:
-            return None
-
-        if stream_pin_hostbuf is None:
-            stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream)
-            if stream_pin_hostbuf is None:
-                return None
-
-        offset = stream_pin_offset
-        stream_pin_offset += buffer_size
-        return offset
-
     for s in comfy_modules:
         signature = comfy_aimdo.model_vbar.vbar_fault(s._v)
         resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature)
@@ -184,12 +163,18 @@ def get_stream_pin_buffer_offset(buffer_size):
         if xfer_dest is None:
             xfer_dest = get_cast_buffer(dest_size)
 
-        def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream):
+        def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream, xfer_dest2=None):
             if xfer_source is not None:
                 if getattr(xfer_source, "is_lowvram_patch", False):
-                    xfer_source.prepare(xfer_dest, stream, copy=True, commit=False)
-                else:
-                    comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream)
+                    if xfer_dest is not None:
+                        xfer_source.prepare(xfer_dest, stream, copy=True, commit=False)
+                        xfer_source = [ xfer_dest ]
+                        xfer_dest = xfer_dest2
+                        xfer_dest2 = None
+                    elif xfer_dest2 is not None:
+                        xfer_source.prepare(xfer_dest2, stream, copy=True, commit=False)
+                        return
+                comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream, r2=xfer_dest2)
 
         def handle_pin(m, pin, source, dest, subset="weights", size=None):
             if pin is not None:
@@ -198,19 +183,7 @@ def handle_pin(m, pin, source, dest, subset="weights", size=None):
             if signature is None:
                 comfy.pinned_memory.pin_memory(m, subset=subset, size=size)
                 pin = comfy.pinned_memory.get_pin(m, subset=subset)
-                if pin is not None:
-                    if isinstance(source, list):
-                        comfy.model_management.cast_to_gathered(source, pin, non_blocking=non_blocking, stream=offload_stream, r2=dest)
-                    else:
-                        cast_maybe_lowvram_patch(source, pin, None)
-                        cast_maybe_lowvram_patch([ pin ], dest, offload_stream)
-                    return
-            if pin is None:
-                pin_offset = get_stream_pin_buffer_offset(size)
-                if pin_offset is not None:
-                    stream_pin_queue.append((source, pin_offset, size, dest))
-                    return
-            cast_maybe_lowvram_patch(source, dest, offload_stream)
+            cast_maybe_lowvram_patch(source, pin, offload_stream, xfer_dest2=dest)
 
         handle_pin(s, pin, xfer_source, xfer_dest, size=dest_size)
 
@@ -232,23 +205,6 @@ def handle_pin(m, pin, source, dest, subset="weights", size=None):
         prefetch["needs_cast"] = needs_cast
         s._prefetch = prefetch
 
-    if stream_pin_offset > 0:
-        if stream_pin_hostbuf.size < stream_pin_offset:
-            if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM):
-                for xfer_source, _, _, xfer_dest in stream_pin_queue:
-                    cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream)
-                return offload_stream
-        stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf)
-        stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf
-        for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue:
-            pin = stream_pin_tensor[pin_offset:pin_offset + pin_size]
-            if isinstance(xfer_source, list):
-                comfy.model_management.cast_to_gathered(xfer_source, pin, non_blocking=non_blocking, stream=offload_stream, r2=xfer_dest)
-            else:
-                cast_maybe_lowvram_patch(xfer_source, pin, None)
-                comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream)
-        stream_pin_hostbuf._comfy_event = offload_stream.record_event()
-
     return offload_stream
 
 

From 8955e1568ee61c3b7c32d28ef1dbe796ec40cc0d Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Tue, 26 May 2026 14:14:58 +1000
Subject: [PATCH 03/11] model_management: allow active pin registration movement

It's better to just let the active model load past the pin limit as
pins and let the pins move around. This saves the HDD and SATA
people disk traffic while only costing a few GPU syncs.
---
 comfy/model_management.py | 11 +++++++++--
 comfy/pinned_memory.py    |  2 +-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index f4fb0f33db84..c8c27c162487 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -648,7 +648,7 @@ def ensure_pin_budget(size, evict_active=False):
     to_free = shortfall + PIN_PRESSURE_HYSTERESIS
     return free_pins(to_free, evict_active=evict_active) >= shortfall
 
-def ensure_pin_registerable(size, evict_active=False):
+def ensure_pin_registerable(size, evict_active=True):
     shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
     if MAX_PINNED_MEMORY <= 0:
         return False
@@ -658,10 +658,17 @@ def ensure_pin_registerable(size, evict_active=False):
     shortfall += REGISTERABLE_PIN_HYSTERESIS
     for loaded_model in reversed(current_loaded_models):
         model = loaded_model.model
-        if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]):
+        if model is not None and model.is_dynamic() and not model.model.dynamic_pins[model.load_device]["active"]:
             shortfall -= model.unregister_inactive_pins(shortfall)
             if shortfall <= 0:
                 return True
+    if evict_active:
+        for loaded_model in current_loaded_models:
+            model = loaded_model.model
+            if model is not None and model.is_dynamic() and model.model.dynamic_pins[model.load_device]["active"]:
+                shortfall -= model.unregister_inactive_pins(shortfall)
+                if shortfall <= 0:
+                    return True
     return shortfall <= REGISTERABLE_PIN_HYSTERESIS
 
 class LoadedModel:
diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py
index 0e8f573ba0ee..d588cc6a01cd 100644
--- a/comfy/pinned_memory.py
+++ b/comfy/pinned_memory.py
@@ -38,7 +38,7 @@ def pin_memory(module, subset="weights", size=None):
     if size is None:
         size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
     offset = hostbuf.size
-    registerable_size = size + max(0, hostbuf.size - pinned_size[0])
+    registerable_size = size
 
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
     if (not comfy.model_management.ensure_pin_budget(size) or

From 1a7a09eb66d22af6848ebaf6a3580a5627f392ef Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Wed, 27 May 2026 00:36:35 +1000
Subject: [PATCH 04/11] utils: use aimdo file handle

This opens the file on Windows with more favourable flags.
---
 comfy/memory_management.py | 3 +++
 comfy/utils.py             | 2 +-
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/comfy/memory_management.py b/comfy/memory_management.py
index 98985b8398c2..e032b7dcdae5 100644
--- a/comfy/memory_management.py
+++ b/comfy/memory_management.py
@@ -74,6 +74,9 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N
                                     device=None if destination2 is None else destination2.device.index)
         return True
 
+    if not hasattr(file_obj, "seek") or not hasattr(file_obj, "readinto"):
+        return False
+
     buf_type = ctypes.c_ubyte * info.size
     view = memoryview(buf_type.from_address(destination.data_ptr()))
 
diff --git a/comfy/utils.py b/comfy/utils.py
index 49ae12b0660c..09c9e458c986 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -85,9 +85,9 @@ def encode(*args, **kwargs):  # no longer necessary on newer torch
 def load_safetensors(ckpt):
     import comfy_aimdo.model_mmap
 
-    f = open(ckpt, "rb", buffering=0)
     file_lock = threading.Lock()
     model_mmap = comfy_aimdo.model_mmap.ModelMMAP(ckpt)
+    f = model_mmap.get_file_handle()
     file_size = os.path.getsize(ckpt)
     mv = memoryview((ctypes.c_uint8 * file_size).from_address(model_mmap.get()))
 

From 30e28b84de3a31b716b5ef8819f372f6d56196ec Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Thu, 28 May 2026 10:15:30 +1000
Subject: [PATCH 05/11] mp: only count the model proper for loaded_ram and vram

Exclude live loras from the numbers to avoid the case where the reported
loaded memory exceeds the size of the model.

This caused me confusion in the Kijai visualizer when it looked fully
loaded but was hitting disk due to this accounting discrepancy.
---
 comfy/model_patcher.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 00a15fa63b0c..54cc5933bba7 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1942,12 +1942,10 @@ def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=Fals
         return freed
 
     def loaded_ram_size(self):
-        return (self.model.dynamic_pins[self.load_device]["weights"][0].size +
-                self.model.dynamic_pins[self.load_device]["patches"][0].size)
+        return (self.model.dynamic_pins[self.load_device]["weights"][0].size)
 
     def pinned_memory_size(self):
-        return (self.model.dynamic_pins[self.load_device]["weights"][3][0] +
-                self.model.dynamic_pins[self.load_device]["patches"][3][0])
+        return (self.model.dynamic_pins[self.load_device]["weights"][3][0])
 
     def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]):
         freed = 0

From abcb75e97f8730faa226d6be03786b806e1f7d02 Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Thu, 28 May 2026 18:08:04 +1000
Subject: [PATCH 06/11] utils: add bit reverse utility

Useful for maximally scattering something ordered.
---
 comfy/utils.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/comfy/utils.py b/comfy/utils.py
index 09c9e458c986..09d783ffff91 100644
--- a/comfy/utils.py
+++ b/comfy/utils.py
@@ -1452,3 +1452,10 @@ def deepcopy_list_dict(obj, memo=None):
 
     memo[obj_id] = res
     return res
+
+def bit_reverse_range(index, bits):
+    result = 0
+    for _ in range(bits):
+        result = (result << 1) | (index & 1)
+        index >>= 1
+    return result

From 015346118cebbcd417e8dd834b628191c2411ba0 Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Thu, 28 May 2026 18:21:58 +1000
Subject: [PATCH 07/11] pinned_memory: Implement offload balancing

Use a max scatter algorithm to prioritize pins of the same size such
that when doing a little bit of offloading it gets scattered, allowing
the prefetcher to more evenly swallow the offload.
---
 comfy/model_management.py | 13 +++++++--
 comfy/model_patcher.py    | 14 +++++-----
 comfy/pinned_memory.py    | 57 ++++++++++++++++++++++++++++++++++-----
 3 files changed, 69 insertions(+), 15 deletions(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index c8c27c162487..19a47e32e6bb 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -1350,9 +1350,18 @@ def reset_cast_buffers():
     for loaded_model in current_loaded_models:
         model = loaded_model.model
         if model is not None and model.is_dynamic():
-            model.model.dynamic_pins[model.load_device]["active"] = False
+            pin_state = model.model.dynamic_pins[model.load_device]
+
+            if pin_state["active"]:
+                *_, buckets = pin_state["weights"]
+                for size, bucket in list(buckets.items()):
+                    bucket[:] = [ entry for entry in bucket if entry[-1] is not None ]
+                    if not bucket:
+                        del buckets[size]
+
+            pin_state["active"] = False
             model.partially_unload_ram(1e30, subsets=[ "patches" ])
-            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0])
+            model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0], [0], {})
 
     STREAM_CAST_BUFFERS.clear()
     STREAM_AIMDO_CAST_BUFFERS.clear()
diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py
index 54cc5933bba7..b716a69e223a 100644
--- a/comfy/model_patcher.py
+++ b/comfy/model_patcher.py
@@ -1721,8 +1721,8 @@ def register_load_device(self, device):
         """
         if device not in self.model.dynamic_pins:
             self.model.dynamic_pins[device] = {
-                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]),
-                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]),
+                "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0], [0], {}),
+                "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0], [0], {}),
                 "hostbufs_initialized": False,
                 "failed": False,
                 "active": False,
@@ -1799,8 +1799,8 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False
             pin_state = self.model.dynamic_pins[self.load_device]
             if not pin_state["hostbufs_initialized"]:
                 hostbuf_size = comfy.model_management.pinned_hostbuf_size(self.model_size())
-                pin_state["weights"] = (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024, hostbuf_size), [], [-1], [0])
-                pin_state["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, hostbuf_size), [], [-1], [0])
+                pin_state["weights"] = (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024, hostbuf_size), [], [-1], [0], [0], {})
+                pin_state["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, hostbuf_size), [], [-1], [0], [0], {})
                 pin_state["hostbufs_initialized"] = True
             pin_state["failed"] = False
             pin_state["active"] = True
@@ -1951,7 +1951,7 @@ def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches"
         freed = 0
         pin_state = self.model.dynamic_pins[self.load_device]
         for subset in subsets:
-            hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+            hostbuf, stack, stack_split, pinned_size, *_ = pin_state[subset]
             split = stack_split[0]
             while split >= 0:
                 module, offset = stack[split]
@@ -1976,10 +1976,12 @@ def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]):
         freed = 0
         pin_state = self.model.dynamic_pins[self.load_device]
         for subset in subsets:
-            hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+            hostbuf, stack, stack_split, pinned_size, *_ = pin_state[subset]
             while len(stack) > 0:
                 module, offset = stack.pop()
                 size = module._pin.numel() * module._pin.element_size()
+                module._pin_balancer_entry[-1] = None
+                del module._pin_balancer_entry
                 del module._pin
                 hostbuf.truncate(offset, do_unregister=module._pin_registered)
                 stack_split[0] = min(stack_split[0], len(stack) - 1)
diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py
index d588cc6a01cd..ffe12e0dc0e9 100644
--- a/comfy/pinned_memory.py
+++ b/comfy/pinned_memory.py
@@ -1,17 +1,55 @@
+import bisect
+
 import comfy.model_management
 import comfy.memory_management
+import comfy.utils
 import comfy_aimdo.host_buffer
 import comfy_aimdo.torch
 import torch
 
 from comfy.cli_args import args
 
+def _add_to_bucket(module, buckets, size, priority):
+    bucket = buckets.setdefault(size, [])
+    entry = [-priority, 0, module]
+    entry[1] = id(entry)
+    bisect.insort(bucket, entry)
+    module._pin_balancer_entry = entry
+
+def _steal_pin(module, stack, buckets, size, priority):
+    bucket = buckets.get(size)
+    if bucket is None:
+        return False
+
+    while bucket and bucket[-1][-1] is None:
+        bucket.pop()
+    if not bucket:
+        del buckets[size]
+        return False
+
+    if priority <= -bucket[-1][0]:
+        return False
+
+    *_, victim = bucket.pop()
+    module._pin = victim._pin
+    module._pin_registered = victim._pin_registered
+    module._pin_stack_index = victim._pin_stack_index
+    stack[module._pin_stack_index] = (module, stack[module._pin_stack_index][1])
+
+    victim._pin_registered = False
+    del victim._pin
+    del victim._pin_stack_index
+    del victim._pin_balancer_entry
+
+    _add_to_bucket(module, buckets, size, priority)
+    return True
+
 def get_pin(module, subset="weights"):
     pin = getattr(module, "_pin", None)
     if pin is None or module._pin_registered or args.disable_pinned_memory:
         return pin
 
-    _, _, stack_split, pinned_size = module._pin_state[subset]
+    _, _, stack_split, pinned_size, *_ = module._pin_state[subset]
     size = pin.nbytes
     comfy.model_management.ensure_pin_registerable(size)
 
@@ -31,26 +69,30 @@ def pin_memory(module, subset="weights", size=None):
         return
 
     pin = get_pin(module, subset)
-    if pin is not None or pin_state["failed"]:
+    if pin is not None:
         return
 
-    hostbuf, stack, stack_split, pinned_size = pin_state[subset]
+    hostbuf, stack, stack_split, pinned_size, counter, buckets = pin_state[subset]
     if size is None:
         size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ])
     offset = hostbuf.size
     registerable_size = size
+    priority = getattr(module, "_pin_balancer_priority", None)
+
+    if priority is None:
+        priority = comfy.utils.bit_reverse_range(counter[0], 16)
+        counter[0] += 1
+        module._pin_balancer_priority = priority
 
     comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM)
     if (not comfy.model_management.ensure_pin_budget(size) or
         not comfy.model_management.ensure_pin_registerable(registerable_size)):
-        pin_state["failed"] = True
-        return False
+        return _steal_pin(module, stack, buckets, size, priority)
 
     try:
         hostbuf.extend(size=size)
     except RuntimeError:
-        pin_state["failed"] = True
-        return False
+        return _steal_pin(module, stack, buckets, size, priority)
 
     module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size]
     module._pin.untyped_storage()._comfy_hostbuf = hostbuf
@@ -60,4 +102,5 @@ def pin_memory(module, subset="weights", size=None):
     stack_split[0] = max(stack_split[0], module._pin_stack_index)
     comfy.model_management.TOTAL_PINNED_MEMORY += size
     pinned_size[0] += size
+    _add_to_bucket(module, buckets, size, priority)
     return True

From 4f24436586ecebc9b9a679d9bac4e557cba2ebae Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Thu, 28 May 2026 18:23:32 +1000
Subject: [PATCH 08/11] comfy-aimdo 0.4.7

Aimdo 0.4.7 implements VRAM buffer exhaustion prediction to avoid
early speculative load of weights that definitely won't fit once the
inference gets further in.
---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 0647cd23e252..a42e3905812c 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0
 filelock
 av>=16.0.0
 comfy-kitchen==0.2.9
-comfy-aimdo==0.4.6
+comfy-aimdo==0.4.7
 requests
 simpleeval>=1.0.0
 blake3

From 027b21575153605c304dcb70c547def06206f4f0 Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Sat, 30 May 2026 03:38:10 +1000
Subject: [PATCH 09/11] model-prefetch: consolidate pin ensures on the sync
 point

This could happen mid prefetch block, causing a sync of the entire
block and losing overlap. Get ahead of the problem with a free down
at the natural compute stream sync point.
---
 comfy/model_prefetch.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/comfy/model_prefetch.py b/comfy/model_prefetch.py
index 72e11dec6a1a..27d852268093 100644
--- a/comfy/model_prefetch.py
+++ b/comfy/model_prefetch.py
@@ -1,4 +1,5 @@
 import comfy_aimdo.model_vbar
+import comfy.memory_management
 import comfy.model_management
 import comfy.ops
 
@@ -50,7 +51,16 @@ def prefetch_queue_pop(queue, device, module):
             if hasattr(s, "_v"):
                 comfy_modules.append(s)
 
+        registerable_size = 0
+        for s in comfy_modules:
+            registerable_size += comfy.memory_management.vram_aligned_size([s.weight, s.bias])
+            for param_key in ("weight", "bias"):
+                lowvram_fn = getattr(s, param_key + "_lowvram_function", None)
+                if lowvram_fn is not None:
+                    registerable_size += lowvram_fn.memory_required()
+
         offload_stream = comfy.ops.cast_modules_with_vbar(comfy_modules, None, device, None, True)
+        comfy.model_management.ensure_pin_registerable(registerable_size)
         comfy.model_management.sync_stream(device, offload_stream)
         queue[0] = (offload_stream, (prefetch, comfy_modules))
 

From 8f28048406ac0705051eaa0ba51a943204909dc4 Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Sat, 30 May 2026 03:42:20 +1000
Subject: [PATCH 10/11] mm: Put a 2GB min on the pin ceiling

This is reasonably bad if it starts causing swap pressure, more so than
during normal RAM-cache proceedings. Clamp it.
---
 comfy/model_management.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/comfy/model_management.py b/comfy/model_management.py
index 19a47e32e6bb..013fcf862d17 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -641,7 +641,7 @@ def free_pins(size, evict_active=False):
     return freed_total
 
 def ensure_pin_budget(size, evict_active=False):
-    shortfall = size + comfy.memory_management.RAM_CACHE_HEADROOM / 2 - psutil.virtual_memory().available
+    shortfall = size + max(comfy.memory_management.RAM_CACHE_HEADROOM / 2, 2048 * 1024 ** 2) - psutil.virtual_memory().available
     if shortfall <= 0:
         return True
 

From 4367270fc14672cec56a07f085c096de39257362 Mon Sep 17 00:00:00 2001
From: Rattus <rattus128@gmail.com>
Date: Sat, 30 May 2026 05:57:46 +1000
Subject: [PATCH 11/11] add --fast-disk

---
 comfy/cli_args.py         | 1 +
 comfy/model_management.py | 5 ++++-
 comfy/model_prefetch.py   | 3 ++-
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/comfy/cli_args.py b/comfy/cli_args.py
index 9bda414d1865..a4cabcc65a85 100644
--- a/comfy/cli_args.py
+++ b/comfy/cli_args.py
@@ -149,6 +149,7 @@ def from_string(cls, value: str):
 parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.")
 parser.add_argument("--disable-dynamic-vram", action="store_true", help="Disable dynamic VRAM and use estimate based model loading.")
 parser.add_argument("--enable-dynamic-vram", action="store_true", help="Enable dynamic VRAM on systems where it's not enabled by default.")
+parser.add_argument("--fast-disk", action="store_true", help="Prefer disk-backed dynamic loading and offload over unpinned RAM. Can be faster for users with fast NVME disks.")
 
 parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.")
 
diff --git a/comfy/model_management.py b/comfy/model_management.py
index 013fcf862d17..c264efc2d98f 100644
--- a/comfy/model_management.py
+++ b/comfy/model_management.py
@@ -641,7 +641,10 @@ def free_pins(size, evict_active=False):
     return freed_total
 
 def ensure_pin_budget(size, evict_active=False):
-    shortfall = size + max(comfy.memory_management.RAM_CACHE_HEADROOM / 2, 2048 * 1024 ** 2) - psutil.virtual_memory().available
+    if args.fast_disk:
+        shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY
+    else:
+        shortfall = size + max(comfy.memory_management.RAM_CACHE_HEADROOM / 2, 2048 * 1024 ** 2) - psutil.virtual_memory().available
     if shortfall <= 0:
         return True
 
diff --git a/comfy/model_prefetch.py b/comfy/model_prefetch.py
index 27d852268093..aa6d22d77ebb 100644
--- a/comfy/model_prefetch.py
+++ b/comfy/model_prefetch.py
@@ -60,7 +60,8 @@ def prefetch_queue_pop(queue, device, module):
                     registerable_size += lowvram_fn.memory_required()
 
         offload_stream = comfy.ops.cast_modules_with_vbar(comfy_modules, None, device, None, True)
-        comfy.model_management.ensure_pin_registerable(registerable_size)
+        if not comfy.model_management.args.fast_disk:
+            comfy.model_management.ensure_pin_registerable(registerable_size)
         comfy.model_management.sync_stream(device, offload_stream)
         queue[0] = (offload_stream, (prefetch, comfy_modules))