From 0b7cf944ee8f498a1a1f3060d916d86082eb542e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 30 Jan 2026 08:56:04 +0000
Subject: [PATCH 1/3] Initial plan

From d5b4c8f5598bf5910ad9c795f17267b539bc2307 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 30 Jan 2026 08:58:49 +0000
Subject: [PATCH 2/3] Port upstream VRAM detection and lyrics fixes from fspecii fork

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 backend/app/services/music_service.py | 35 ++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/backend/app/services/music_service.py b/backend/app/services/music_service.py
index 07dac67..fa7c0a5 100644
--- a/backend/app/services/music_service.py
+++ b/backend/app/services/music_service.py
@@ -198,17 +198,23 @@ def detect_optimal_gpu_config() -> dict:
 
         for i in range(num_gpus):
             props = torch.cuda.get_device_properties(i)
-            vram_gb = props.total_memory / (1024 ** 3)
+            total_vram_gb = props.total_memory / (1024 ** 3)
+            # Get available/free VRAM (total - already allocated)
+            with torch.cuda.device(i):
+                free_vram_bytes = torch.cuda.mem_get_info()[0]  # (free, total)
+            free_vram_gb = free_vram_bytes / (1024 ** 3)
             compute_cap = props.major + props.minor / 10
             gpu_info[i] = {
                 "name": props.name,
-                "vram_gb": vram_gb,
+                "vram_gb": total_vram_gb,
+                "vram_free_gb": free_vram_gb,
                 "compute_capability": compute_cap,
                 "supports_flash_attention": compute_cap >= 7.0,
             }
-            total_vram += vram_gb
-            if vram_gb > max_vram:
-                max_vram = vram_gb
+            total_vram += total_vram_gb
+            # Use FREE VRAM for decision making, not total
+            if free_vram_gb > max_vram:
+                max_vram = free_vram_gb
                 max_vram_gpu = i
             if compute_cap > max_compute:
                 max_compute = compute_cap
@@ -216,15 +222,19 @@ def detect_optimal_gpu_config() -> dict:
 
         result["gpu_info"] = gpu_info
 
-        # Log detected GPUs
+        # Log detected GPUs with both total and available VRAM
         print(f"\n[Auto-Config] Detected {num_gpus} GPU(s):", flush=True)
         for i, info in gpu_info.items():
             fa_status = "✓ Flash Attention" if info["supports_flash_attention"] else "✗ No Flash Attention"
-            print(f"  GPU {i}: {info['name']} ({info['vram_gb']:.1f} GB, SM {info['compute_capability']}) - {fa_status}", flush=True)
+            vram_status = f"{info['vram_free_gb']:.1f}GB free / {info['vram_gb']:.1f}GB total"
+            print(f"  GPU {i}: {info['name']} ({vram_status}, SM {info['compute_capability']}) - {fa_status}", flush=True)
 
         # Decision logic for single GPU
         if num_gpus == 1:
-            vram = gpu_info[0]["vram_gb"]
+            # Use FREE VRAM for threshold decisions (accounts for VRAM used by other apps)
+            vram = gpu_info[0]["vram_free_gb"]
+            total_vram = gpu_info[0]["vram_gb"]
+            print(f"[Auto-Config] Using FREE VRAM ({vram:.1f}GB) for configuration (total: {total_vram:.1f}GB)", flush=True)
 
             if vram >= VRAM_THRESHOLD_FULL_PRECISION:
                 # 20GB+: Full precision, no swapping needed
@@ -746,6 +756,13 @@ def generate_with_callback(inputs, callback=None, **kwargs):
                 bs_size = 2 if cfg_scale != 1.0 else 1
                 pipeline.mula.setup_caches(bs_size)
 
+                # Log VRAM usage after cache setup
+                if torch.cuda.is_available():
+                    allocated = torch.cuda.memory_allocated() / 1024**3
+                    reserved = torch.cuda.memory_reserved() / 1024**3
+                    free = torch.cuda.mem_get_info()[0] / 1024**3
+                    print(f"[VRAM] After cache setup: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {free:.2f}GB free", flush=True)
+
                 with get_autocast_context(pipeline.mula_device.type, pipeline.mula_dtype):
                     curr_token = pipeline.mula.generate_frame(
                         tokens=prompt_tokens,
@@ -1998,7 +2015,7 @@ def _run_pipeline():
 
             with torch.no_grad():
                 pipeline_inputs = {
-                    "lyrics": request.lyrics,
+                    "lyrics": request.lyrics or "",  # heartlib expects string, not None
                    "tags": sound_tags,
                 }
                 if ref_audio_path:

From f0d55fe24ae8386d65b37d842f36b74c9a51a59f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 30 Jan 2026 09:02:36 +0000
Subject: [PATCH 3/3] Address code review feedback - improve MPS isolation and variable naming

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 backend/app/services/music_service.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/app/services/music_service.py b/backend/app/services/music_service.py
index fa7c0a5..2b3c099 100644
--- a/backend/app/services/music_service.py
+++ b/backend/app/services/music_service.py
@@ -233,8 +233,8 @@ def detect_optimal_gpu_config() -> dict:
         if num_gpus == 1:
             # Use FREE VRAM for threshold decisions (accounts for VRAM used by other apps)
             vram = gpu_info[0]["vram_free_gb"]
-            total_vram = gpu_info[0]["vram_gb"]
-            print(f"[Auto-Config] Using FREE VRAM ({vram:.1f}GB) for configuration (total: {total_vram:.1f}GB)", flush=True)
+            gpu_total_vram = gpu_info[0]["vram_gb"]
+            print(f"[Auto-Config] Using FREE VRAM ({vram:.1f}GB) for configuration (total: {gpu_total_vram:.1f}GB)", flush=True)
 
             if vram >= VRAM_THRESHOLD_FULL_PRECISION:
                 # 20GB+: Full precision, no swapping needed
@@ -756,8 +756,8 @@ def generate_with_callback(inputs, callback=None, **kwargs):
                 bs_size = 2 if cfg_scale != 1.0 else 1
                 pipeline.mula.setup_caches(bs_size)
 
-                # Log VRAM usage after cache setup
-                if torch.cuda.is_available():
+                # Log VRAM usage after cache setup (CUDA only)
+                if torch.cuda.is_available() and pipeline.mula_device.type == 'cuda':
                     allocated = torch.cuda.memory_allocated() / 1024**3
                     reserved = torch.cuda.memory_reserved() / 1024**3
                     free = torch.cuda.mem_get_info()[0] / 1024**3
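Reviewer note: the patches above switch the single-GPU decision logic from total VRAM (get_device_properties().total_memory) to free VRAM (torch.cuda.mem_get_info()), so memory already held by other processes no longer pushes the service into a configuration it cannot fit. Below is a minimal standalone sketch of that probe for local testing; pick_precision() and its 20.0 GB default are illustrative assumptions standing in for VRAM_THRESHOLD_FULL_PRECISION, not code taken from this repository.

# Sketch of the free-vs-total VRAM probe used in the patches above.
# pick_precision() and the 20 GB threshold are illustrative assumptions.
import torch

def probe_gpus() -> dict:
    """Return per-GPU total/free VRAM (GB) and compute capability."""
    gpus = {}
    if not torch.cuda.is_available():
        return gpus
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        with torch.cuda.device(i):
            free_bytes, total_bytes = torch.cuda.mem_get_info()  # driver-level (free, total)
        gpus[i] = {
            "name": props.name,
            "vram_gb": total_bytes / 1024**3,
            "vram_free_gb": free_bytes / 1024**3,  # what is actually available right now
            "compute_capability": props.major + props.minor / 10,
        }
    return gpus

def pick_precision(free_gb: float, full_precision_threshold_gb: float = 20.0) -> str:
    """Decide on a precision mode from FREE VRAM, not total VRAM."""
    return "full" if free_gb >= full_precision_threshold_gb else "quantized/offloaded"

if __name__ == "__main__":
    for i, info in probe_gpus().items():
        mode = pick_precision(info["vram_free_gb"])
        print(f"GPU {i}: {info['name']} "
              f"{info['vram_free_gb']:.1f}GB free / {info['vram_gb']:.1f}GB total -> {mode}")

Design note: mem_get_info() reports device-wide free memory as seen by the driver, which is why it catches VRAM consumed by other applications, whereas memory_allocated()/memory_reserved() only reflect the current process's PyTorch caching allocator.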