diff --git a/comfy/cli_args.py b/comfy/cli_args.py index 9bda414d1865..a4cabcc65a85 100644 --- a/comfy/cli_args.py +++ b/comfy/cli_args.py @@ -149,6 +149,7 @@ def from_string(cls, value: str): parser.add_argument("--disable-async-offload", action="store_true", help="Disable async weight offloading.") parser.add_argument("--disable-dynamic-vram", action="store_true", help="Disable dynamic VRAM and use estimate based model loading.") parser.add_argument("--enable-dynamic-vram", action="store_true", help="Enable dynamic VRAM on systems where it's not enabled by default.") +parser.add_argument("--fast-disk", action="store_true", help="Prefer disk-backed dynamic loading and offload over unpinned RAM. Can be faster for users with fast NVME disks.") parser.add_argument("--force-non-blocking", action="store_true", help="Force ComfyUI to use non-blocking operations for all applicable tensors. This may improve performance on some non-Nvidia systems but can cause issues with some workflows.") diff --git a/comfy/memory_management.py b/comfy/memory_management.py index 962addb27bf9..e032b7dcdae5 100644 --- a/comfy/memory_management.py +++ b/comfy/memory_management.py @@ -4,6 +4,7 @@ import torch from typing import NamedTuple +import comfy_aimdo.host_buffer from comfy.quant_ops import QuantizedTensor @@ -17,21 +18,18 @@ class TensorFileSlice(NamedTuple): def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=None): if isinstance(tensor, QuantizedTensor): - if not isinstance(destination, QuantizedTensor): - return False - if tensor._layout_cls != destination._layout_cls: - return False - - if not read_tensor_file_slice_into(tensor._qdata, destination._qdata, stream=stream, + if not read_tensor_file_slice_into(tensor._qdata, + destination._qdata if destination is not None else None, stream=stream, destination2=(destination2._qdata if destination2 is not None else None)): return False - dst_orig_dtype = destination._params.orig_dtype - destination._params.copy_from(tensor._params, non_blocking=False) - destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype) + if destination is not None: + dst_orig_dtype = destination._params.orig_dtype + destination._params.copy_from(tensor._params, non_blocking=False) + destination._params = dataclasses.replace(destination._params, orig_dtype=dst_orig_dtype) if destination2 is not None: dst_orig_dtype = destination2._params.orig_dtype - destination2._params.copy_from(destination._params, non_blocking=True) + destination2._params.copy_from(destination._params if destination is not None else tensor._params, non_blocking=True) destination2._params = dataclasses.replace(destination2._params, orig_dtype=dst_orig_dtype) return True @@ -39,10 +37,15 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N if info is None: return False + if destination is not None and destination.device.type != "cpu" and destination2 is None: + destination2 = destination + destination = None + file_obj = info.file_ref - if (destination.device.type != "cpu" - or file_obj is None - or destination.numel() * destination.element_size() < info.size + if (file_obj is None + or (destination is None and destination2 is None) + or (destination is not None and (destination.device.type != "cpu" or destination.numel() * destination.element_size() < info.size)) + or (destination2 is not None and (destination2.device.type == "cpu" or destination2.numel() * destination2.element_size() < info.size)) or tensor.numel() * tensor.element_size() != info.size or tensor.storage_offset() != 0 or not tensor.is_contiguous()): @@ -51,6 +54,14 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N if info.size == 0: return True + if destination is None: + stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0 + comfy_aimdo.host_buffer.read_file_to_device(file_obj, info.offset, info.size, + stream_ptr, destination2.data_ptr(), + destination2.device.index, + mark_cold=False) + return True + hostbuf = getattr(destination.untyped_storage(), "_comfy_hostbuf", None) if hostbuf is not None: stream_ptr = getattr(stream, "cuda_stream", 0) if stream is not None else 0 @@ -63,6 +74,9 @@ def read_tensor_file_slice_into(tensor, destination, stream=None, destination2=N device=None if destination2 is None else destination2.device.index) return True + if not hasattr(file_obj, "seek") or not hasattr(file_obj, "readinto"): + return False + buf_type = ctypes.c_ubyte * info.size view = memoryview(buf_type.from_address(destination.data_ptr())) diff --git a/comfy/model_management.py b/comfy/model_management.py index b01c4d7fad21..c264efc2d98f 100644 --- a/comfy/model_management.py +++ b/comfy/model_management.py @@ -641,14 +641,17 @@ def free_pins(size, evict_active=False): return freed_total def ensure_pin_budget(size, evict_active=False): - shortfall = size + comfy.memory_management.RAM_CACHE_HEADROOM / 2 - psutil.virtual_memory().available + if args.fast_disk: + shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY + else: + shortfall = size + max(comfy.memory_management.RAM_CACHE_HEADROOM / 2, 2048 * 1024 ** 2) - psutil.virtual_memory().available if shortfall <= 0: return True to_free = shortfall + PIN_PRESSURE_HYSTERESIS return free_pins(to_free, evict_active=evict_active) >= shortfall -def ensure_pin_registerable(size, evict_active=False): +def ensure_pin_registerable(size, evict_active=True): shortfall = TOTAL_PINNED_MEMORY + size - MAX_PINNED_MEMORY if MAX_PINNED_MEMORY <= 0: return False @@ -658,10 +661,17 @@ def ensure_pin_registerable(size, evict_active=False): shortfall += REGISTERABLE_PIN_HYSTERESIS for loaded_model in reversed(current_loaded_models): model = loaded_model.model - if model is not None and model.is_dynamic() and (evict_active or not model.model.dynamic_pins[model.load_device]["active"]): + if model is not None and model.is_dynamic() and not model.model.dynamic_pins[model.load_device]["active"]: shortfall -= model.unregister_inactive_pins(shortfall) if shortfall <= 0: return True + if evict_active: + for loaded_model in current_loaded_models: + model = loaded_model.model + if model is not None and model.is_dynamic() and model.model.dynamic_pins[model.load_device]["active"]: + shortfall -= model.unregister_inactive_pins(shortfall) + if shortfall <= 0: + return True return shortfall <= REGISTERABLE_PIN_HYSTERESIS class LoadedModel: @@ -1283,7 +1293,6 @@ def current_stream(device): LARGEST_CASTED_WEIGHT = (None, 0) STREAM_AIMDO_CAST_BUFFERS = {} LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) -STREAM_PIN_BUFFERS = {} DEFAULT_AIMDO_CAST_BUFFER_RESERVATION_SIZE = 16 * 1024 ** 3 @@ -1326,42 +1335,13 @@ def get_aimdo_cast_buffer(offload_stream, device): STREAM_AIMDO_CAST_BUFFERS[offload_stream] = cast_buffer return cast_buffer -def get_pin_buffer(offload_stream): - pin_buffer = STREAM_PIN_BUFFERS.get(offload_stream, None) - if pin_buffer is None: - pin_buffer = comfy_aimdo.host_buffer.HostBuffer(0, 0, pinned_hostbuf_size(8 * 1024**3), mark_cold=False) - STREAM_PIN_BUFFERS[offload_stream] = pin_buffer - elif offload_stream is not None: - event = getattr(pin_buffer, "_comfy_event", None) - if event is not None: - event.synchronize() - delattr(pin_buffer, "_comfy_event") - return pin_buffer - -def resize_pin_buffer(pin_buffer, size): - global TOTAL_PINNED_MEMORY - old_size = pin_buffer.size - if size <= old_size: - return True - growth = size - old_size - comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) - ensure_pin_budget(growth, evict_active=True) - ensure_pin_registerable(growth, evict_active=True) - try: - pin_buffer.extend(size=size, reallocate=True) - except RuntimeError: - return False - TOTAL_PINNED_MEMORY += pin_buffer.size - old_size - return True - def reset_cast_buffers(): - global TOTAL_PINNED_MEMORY global LARGEST_CASTED_WEIGHT global LARGEST_AIMDO_CASTED_WEIGHT LARGEST_CASTED_WEIGHT = (None, 0) LARGEST_AIMDO_CASTED_WEIGHT = (None, 0) - for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS) | set(STREAM_PIN_BUFFERS): + for offload_stream in set(STREAM_CAST_BUFFERS) | set(STREAM_AIMDO_CAST_BUFFERS): if offload_stream is not None: offload_stream.synchronize() synchronize() @@ -1370,20 +1350,24 @@ def reset_cast_buffers(): mmap_obj.bounce() DIRTY_MMAPS.clear() - for pin_buffer in STREAM_PIN_BUFFERS.values(): - TOTAL_PINNED_MEMORY -= pin_buffer.size - TOTAL_PINNED_MEMORY = max(0, TOTAL_PINNED_MEMORY) - for loaded_model in current_loaded_models: model = loaded_model.model if model is not None and model.is_dynamic(): - model.model.dynamic_pins[model.load_device]["active"] = False + pin_state = model.model.dynamic_pins[model.load_device] + + if pin_state["active"]: + *_, buckets = pin_state["weights"] + for size, bucket in list(buckets.items()): + bucket[:] = [ entry for entry in bucket if entry[-1] is not None ] + if not bucket: + del buckets[size] + + pin_state["active"] = False model.partially_unload_ram(1e30, subsets=[ "patches" ]) - model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0]) + model.model.dynamic_pins[model.load_device]["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, pinned_hostbuf_size(model.model_size())), [], [-1], [0], [0], {}) STREAM_CAST_BUFFERS.clear() STREAM_AIMDO_CAST_BUFFERS.clear() - STREAM_PIN_BUFFERS.clear() soft_empty_cache() def get_offload_stream(device): @@ -1436,7 +1420,7 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None): if hasattr(wf_context, "as_context"): wf_context = wf_context.as_context(stream) - dest_views = comfy.memory_management.interpret_gathered_like(tensors, r) + dest_views = comfy.memory_management.interpret_gathered_like(tensors, r) if r is not None else [None] * len(tensors) dest2_views = comfy.memory_management.interpret_gathered_like(tensors, r2) if r2 is not None else None with wf_context: for tensor in tensors: @@ -1448,9 +1432,10 @@ def cast_to_gathered(tensors, r, non_blocking=False, stream=None, r2=None): continue storage = tensor._qdata.untyped_storage() if isinstance(tensor, comfy.quant_ops.QuantizedTensor) else tensor.untyped_storage() mark_mmap_dirty(storage) - dest_view.copy_(tensor, non_blocking=non_blocking) + if dest_view is not None: + dest_view.copy_(tensor, non_blocking=non_blocking) if dest2_view is not None: - dest2_view.copy_(dest_view, non_blocking=non_blocking) + dest2_view.copy_(tensor if dest_view is None else dest_view, non_blocking=non_blocking) def cast_to(weight, dtype=None, device=None, non_blocking=False, copy=False, stream=None, r=None): diff --git a/comfy/model_patcher.py b/comfy/model_patcher.py index 00a15fa63b0c..b716a69e223a 100644 --- a/comfy/model_patcher.py +++ b/comfy/model_patcher.py @@ -1721,8 +1721,8 @@ def register_load_device(self, device): """ if device not in self.model.dynamic_pins: self.model.dynamic_pins[device] = { - "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]), - "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0]), + "weights": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0], [0], {}), + "patches": (comfy_aimdo.host_buffer.HostBuffer(0, 0, 0), [], [-1], [0], [0], {}), "hostbufs_initialized": False, "failed": False, "active": False, @@ -1799,8 +1799,8 @@ def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False pin_state = self.model.dynamic_pins[self.load_device] if not pin_state["hostbufs_initialized"]: hostbuf_size = comfy.model_management.pinned_hostbuf_size(self.model_size()) - pin_state["weights"] = (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024, hostbuf_size), [], [-1], [0]) - pin_state["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, hostbuf_size), [], [-1], [0]) + pin_state["weights"] = (comfy_aimdo.host_buffer.HostBuffer(0, 64 * 1024 * 1024, hostbuf_size), [], [-1], [0], [0], {}) + pin_state["patches"] = (comfy_aimdo.host_buffer.HostBuffer(0, 8 * 1024 * 1024, hostbuf_size), [], [-1], [0], [0], {}) pin_state["hostbufs_initialized"] = True pin_state["failed"] = False pin_state["active"] = True @@ -1942,18 +1942,16 @@ def partially_unload(self, device_to, memory_to_free=0, force_patch_weights=Fals return freed def loaded_ram_size(self): - return (self.model.dynamic_pins[self.load_device]["weights"][0].size + - self.model.dynamic_pins[self.load_device]["patches"][0].size) + return (self.model.dynamic_pins[self.load_device]["weights"][0].size) def pinned_memory_size(self): - return (self.model.dynamic_pins[self.load_device]["weights"][3][0] + - self.model.dynamic_pins[self.load_device]["patches"][3][0]) + return (self.model.dynamic_pins[self.load_device]["weights"][3][0]) def unregister_inactive_pins(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] for subset in subsets: - hostbuf, stack, stack_split, pinned_size = pin_state[subset] + hostbuf, stack, stack_split, pinned_size, *_ = pin_state[subset] split = stack_split[0] while split >= 0: module, offset = stack[split] @@ -1978,10 +1976,12 @@ def partially_unload_ram(self, ram_to_unload, subsets=[ "weights", "patches" ]): freed = 0 pin_state = self.model.dynamic_pins[self.load_device] for subset in subsets: - hostbuf, stack, stack_split, pinned_size = pin_state[subset] + hostbuf, stack, stack_split, pinned_size, *_ = pin_state[subset] while len(stack) > 0: module, offset = stack.pop() size = module._pin.numel() * module._pin.element_size() + module._pin_balancer_entry[-1] = None + del module._pin_balancer_entry del module._pin hostbuf.truncate(offset, do_unregister=module._pin_registered) stack_split[0] = min(stack_split[0], len(stack) - 1) diff --git a/comfy/model_prefetch.py b/comfy/model_prefetch.py index 72e11dec6a1a..aa6d22d77ebb 100644 --- a/comfy/model_prefetch.py +++ b/comfy/model_prefetch.py @@ -1,4 +1,5 @@ import comfy_aimdo.model_vbar +import comfy.memory_management import comfy.model_management import comfy.ops @@ -50,7 +51,17 @@ def prefetch_queue_pop(queue, device, module): if hasattr(s, "_v"): comfy_modules.append(s) + registerable_size = 0 + for s in comfy_modules: + registerable_size += comfy.memory_management.vram_aligned_size([s.weight, s.bias]) + for param_key in ("weight", "bias"): + lowvram_fn = getattr(s, param_key + "_lowvram_function", None) + if lowvram_fn is not None: + registerable_size += lowvram_fn.memory_required() + offload_stream = comfy.ops.cast_modules_with_vbar(comfy_modules, None, device, None, True) + if not comfy.model_management.args.fast_disk: + comfy.model_management.ensure_pin_registerable(registerable_size) comfy.model_management.sync_stream(device, offload_stream) queue[0] = (offload_stream, (prefetch, comfy_modules)) diff --git a/comfy/ops.py b/comfy/ops.py index 56445be8d4c2..119177c374f9 100644 --- a/comfy/ops.py +++ b/comfy/ops.py @@ -76,8 +76,6 @@ def scaled_dot_product_attention(q, k, v, *args, **kwargs): cast_to = comfy.model_management.cast_to #TODO: remove once no more references -STREAM_PIN_BUFFER_HEADROOM = 8 * 1024 * 1024 - def cast_to_input(weight, input, non_blocking=False, copy=True): return comfy.model_management.cast_to(weight, input.dtype, input.device, non_blocking=non_blocking, copy=copy) @@ -94,9 +92,6 @@ def cast_modules_with_vbar(comfy_modules, dtype, device, bias_dtype, non_blockin offload_stream = None cast_buffer = None cast_buffer_offset = 0 - stream_pin_hostbuf = None - stream_pin_offset = 0 - stream_pin_queue = [] def ensure_offload_stream(module, required_size, check_largest): nonlocal offload_stream @@ -130,22 +125,6 @@ def get_cast_buffer(buffer_size): cast_buffer_offset += buffer_size return buffer - def get_stream_pin_buffer_offset(buffer_size): - nonlocal stream_pin_hostbuf - nonlocal stream_pin_offset - - if buffer_size == 0 or offload_stream is None: - return None - - if stream_pin_hostbuf is None: - stream_pin_hostbuf = comfy.model_management.get_pin_buffer(offload_stream) - if stream_pin_hostbuf is None: - return None - - offset = stream_pin_offset - stream_pin_offset += buffer_size - return offset - for s in comfy_modules: signature = comfy_aimdo.model_vbar.vbar_fault(s._v) resident = comfy_aimdo.model_vbar.vbar_signature_compare(signature, s._v_signature) @@ -184,12 +163,18 @@ def get_stream_pin_buffer_offset(buffer_size): if xfer_dest is None: xfer_dest = get_cast_buffer(dest_size) - def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream): + def cast_maybe_lowvram_patch(xfer_source, xfer_dest, stream, xfer_dest2=None): if xfer_source is not None: if getattr(xfer_source, "is_lowvram_patch", False): - xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) - else: - comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream) + if xfer_dest is not None: + xfer_source.prepare(xfer_dest, stream, copy=True, commit=False) + xfer_source = [ xfer_dest ] + xfer_dest = xfer_dest2 + xfer_dest2 = None + elif xfer_dest2 is not None: + xfer_source.prepare(xfer_dest2, stream, copy=True, commit=False) + return + comfy.model_management.cast_to_gathered(xfer_source, xfer_dest, non_blocking=non_blocking, stream=stream, r2=xfer_dest2) def handle_pin(m, pin, source, dest, subset="weights", size=None): if pin is not None: @@ -198,19 +183,7 @@ def handle_pin(m, pin, source, dest, subset="weights", size=None): if signature is None: comfy.pinned_memory.pin_memory(m, subset=subset, size=size) pin = comfy.pinned_memory.get_pin(m, subset=subset) - if pin is not None: - if isinstance(source, list): - comfy.model_management.cast_to_gathered(source, pin, non_blocking=non_blocking, stream=offload_stream, r2=dest) - else: - cast_maybe_lowvram_patch(source, pin, None) - cast_maybe_lowvram_patch([ pin ], dest, offload_stream) - return - if pin is None: - pin_offset = get_stream_pin_buffer_offset(size) - if pin_offset is not None: - stream_pin_queue.append((source, pin_offset, size, dest)) - return - cast_maybe_lowvram_patch(source, dest, offload_stream) + cast_maybe_lowvram_patch(source, pin, offload_stream, xfer_dest2=dest) handle_pin(s, pin, xfer_source, xfer_dest, size=dest_size) @@ -232,23 +205,6 @@ def handle_pin(m, pin, source, dest, subset="weights", size=None): prefetch["needs_cast"] = needs_cast s._prefetch = prefetch - if stream_pin_offset > 0: - if stream_pin_hostbuf.size < stream_pin_offset: - if not comfy.model_management.resize_pin_buffer(stream_pin_hostbuf, stream_pin_offset + STREAM_PIN_BUFFER_HEADROOM): - for xfer_source, _, _, xfer_dest in stream_pin_queue: - cast_maybe_lowvram_patch(xfer_source, xfer_dest, offload_stream) - return offload_stream - stream_pin_tensor = comfy_aimdo.torch.hostbuf_to_tensor(stream_pin_hostbuf) - stream_pin_tensor.untyped_storage()._comfy_hostbuf = stream_pin_hostbuf - for xfer_source, pin_offset, pin_size, xfer_dest in stream_pin_queue: - pin = stream_pin_tensor[pin_offset:pin_offset + pin_size] - if isinstance(xfer_source, list): - comfy.model_management.cast_to_gathered(xfer_source, pin, non_blocking=non_blocking, stream=offload_stream, r2=xfer_dest) - else: - cast_maybe_lowvram_patch(xfer_source, pin, None) - comfy.model_management.cast_to_gathered([ pin ], xfer_dest, non_blocking=non_blocking, stream=offload_stream) - stream_pin_hostbuf._comfy_event = offload_stream.record_event() - return offload_stream diff --git a/comfy/pinned_memory.py b/comfy/pinned_memory.py index 0e8f573ba0ee..ffe12e0dc0e9 100644 --- a/comfy/pinned_memory.py +++ b/comfy/pinned_memory.py @@ -1,17 +1,55 @@ +import bisect + import comfy.model_management import comfy.memory_management +import comfy.utils import comfy_aimdo.host_buffer import comfy_aimdo.torch import torch from comfy.cli_args import args +def _add_to_bucket(module, buckets, size, priority): + bucket = buckets.setdefault(size, []) + entry = [-priority, 0, module] + entry[1] = id(entry) + bisect.insort(bucket, entry) + module._pin_balancer_entry = entry + +def _steal_pin(module, stack, buckets, size, priority): + bucket = buckets.get(size) + if bucket is None: + return False + + while bucket and bucket[-1][-1] is None: + bucket.pop() + if not bucket: + del buckets[size] + return False + + if priority <= -bucket[-1][0]: + return False + + *_, victim = bucket.pop() + module._pin = victim._pin + module._pin_registered = victim._pin_registered + module._pin_stack_index = victim._pin_stack_index + stack[module._pin_stack_index] = (module, stack[module._pin_stack_index][1]) + + victim._pin_registered = False + del victim._pin + del victim._pin_stack_index + del victim._pin_balancer_entry + + _add_to_bucket(module, buckets, size, priority) + return True + def get_pin(module, subset="weights"): pin = getattr(module, "_pin", None) if pin is None or module._pin_registered or args.disable_pinned_memory: return pin - _, _, stack_split, pinned_size = module._pin_state[subset] + _, _, stack_split, pinned_size, *_ = module._pin_state[subset] size = pin.nbytes comfy.model_management.ensure_pin_registerable(size) @@ -31,26 +69,30 @@ def pin_memory(module, subset="weights", size=None): return pin = get_pin(module, subset) - if pin is not None or pin_state["failed"]: + if pin is not None: return - hostbuf, stack, stack_split, pinned_size = pin_state[subset] + hostbuf, stack, stack_split, pinned_size, counter, buckets = pin_state[subset] if size is None: size = comfy.memory_management.vram_aligned_size([ module.weight, module.bias ]) offset = hostbuf.size - registerable_size = size + max(0, hostbuf.size - pinned_size[0]) + registerable_size = size + priority = getattr(module, "_pin_balancer_priority", None) + + if priority is None: + priority = comfy.utils.bit_reverse_range(counter[0], 16) + counter[0] += 1 + module._pin_balancer_priority = priority comfy.memory_management.extra_ram_release(comfy.memory_management.RAM_CACHE_HEADROOM) if (not comfy.model_management.ensure_pin_budget(size) or not comfy.model_management.ensure_pin_registerable(registerable_size)): - pin_state["failed"] = True - return False + return _steal_pin(module, stack, buckets, size, priority) try: hostbuf.extend(size=size) except RuntimeError: - pin_state["failed"] = True - return False + return _steal_pin(module, stack, buckets, size, priority) module._pin = comfy_aimdo.torch.hostbuf_to_tensor(hostbuf)[offset:offset + size] module._pin.untyped_storage()._comfy_hostbuf = hostbuf @@ -60,4 +102,5 @@ def pin_memory(module, subset="weights", size=None): stack_split[0] = max(stack_split[0], module._pin_stack_index) comfy.model_management.TOTAL_PINNED_MEMORY += size pinned_size[0] += size + _add_to_bucket(module, buckets, size, priority) return True diff --git a/comfy/utils.py b/comfy/utils.py index 49ae12b0660c..09d783ffff91 100644 --- a/comfy/utils.py +++ b/comfy/utils.py @@ -85,9 +85,9 @@ def encode(*args, **kwargs): # no longer necessary on newer torch def load_safetensors(ckpt): import comfy_aimdo.model_mmap - f = open(ckpt, "rb", buffering=0) file_lock = threading.Lock() model_mmap = comfy_aimdo.model_mmap.ModelMMAP(ckpt) + f = model_mmap.get_file_handle() file_size = os.path.getsize(ckpt) mv = memoryview((ctypes.c_uint8 * file_size).from_address(model_mmap.get())) @@ -1452,3 +1452,10 @@ def deepcopy_list_dict(obj, memo=None): memo[obj_id] = res return res + +def bit_reverse_range(index, bits): + result = 0 + for _ in range(bits): + result = (result << 1) | (index & 1) + index >>= 1 + return result diff --git a/requirements.txt b/requirements.txt index 0617667e1ba5..a42e3905812c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -23,7 +23,7 @@ SQLAlchemy>=2.0.0 filelock av>=16.0.0 comfy-kitchen==0.2.9 -comfy-aimdo==0.4.5 +comfy-aimdo==0.4.7 requests simpleeval>=1.0.0 blake3