From 0b7cf944ee8f498a1a1f3060d916d86082eb542e Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 30 Jan 2026 08:56:04 +0000
Subject: [PATCH 1/3] Initial plan

From d5b4c8f5598bf5910ad9c795f17267b539bc2307 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 30 Jan 2026 08:58:49 +0000
Subject: [PATCH 2/3] Port upstream VRAM detection and lyrics fixes from fspecii fork

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 backend/app/services/music_service.py | 35 ++++++++++++++++++++-------
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/backend/app/services/music_service.py b/backend/app/services/music_service.py
index 07dac67..fa7c0a5 100644
--- a/backend/app/services/music_service.py
+++ b/backend/app/services/music_service.py
@@ -198,17 +198,23 @@ def detect_optimal_gpu_config() -> dict:
 
         for i in range(num_gpus):
             props = torch.cuda.get_device_properties(i)
-            vram_gb = props.total_memory / (1024 ** 3)
+            total_vram_gb = props.total_memory / (1024 ** 3)
+            # Get available/free VRAM (total - already allocated)
+            with torch.cuda.device(i):
+                free_vram_bytes = torch.cuda.mem_get_info()[0]  # (free, total)
+            free_vram_gb = free_vram_bytes / (1024 ** 3)
             compute_cap = props.major + props.minor / 10
             gpu_info[i] = {
                 "name": props.name,
-                "vram_gb": vram_gb,
+                "vram_gb": total_vram_gb,
+                "vram_free_gb": free_vram_gb,
                 "compute_capability": compute_cap,
                 "supports_flash_attention": compute_cap >= 7.0,
             }
-            total_vram += vram_gb
-            if vram_gb > max_vram:
-                max_vram = vram_gb
+            total_vram += total_vram_gb
+            # Use FREE VRAM for decision making, not total
+            if free_vram_gb > max_vram:
+                max_vram = free_vram_gb
                 max_vram_gpu = i
             if compute_cap > max_compute:
                 max_compute = compute_cap
@@ -216,15 +222,19 @@ def detect_optimal_gpu_config() -> dict:
 
         result["gpu_info"] = gpu_info
 
-        # Log detected GPUs
+        # Log detected GPUs with both total and available VRAM
         print(f"\n[Auto-Config] Detected {num_gpus} GPU(s):", flush=True)
         for i, info in gpu_info.items():
             fa_status = "✓ Flash Attention" if info["supports_flash_attention"] else "✗ No Flash Attention"
-            print(f"  GPU {i}: {info['name']} ({info['vram_gb']:.1f} GB, SM {info['compute_capability']}) - {fa_status}", flush=True)
+            vram_status = f"{info['vram_free_gb']:.1f}GB free / {info['vram_gb']:.1f}GB total"
+            print(f"  GPU {i}: {info['name']} ({vram_status}, SM {info['compute_capability']}) - {fa_status}", flush=True)
 
         # Decision logic for single GPU
         if num_gpus == 1:
-            vram = gpu_info[0]["vram_gb"]
+            # Use FREE VRAM for threshold decisions (accounts for VRAM used by other apps)
+            vram = gpu_info[0]["vram_free_gb"]
+            total_vram = gpu_info[0]["vram_gb"]
+            print(f"[Auto-Config] Using FREE VRAM ({vram:.1f}GB) for configuration (total: {total_vram:.1f}GB)", flush=True)
 
             if vram >= VRAM_THRESHOLD_FULL_PRECISION:
                 # 20GB+: Full precision, no swapping needed
@@ -746,6 +756,13 @@ def generate_with_callback(inputs, callback=None, **kwargs):
                 bs_size = 2 if cfg_scale != 1.0 else 1
                 pipeline.mula.setup_caches(bs_size)
 
+                # Log VRAM usage after cache setup
+                if torch.cuda.is_available():
+                    allocated = torch.cuda.memory_allocated() / 1024**3
+                    reserved = torch.cuda.memory_reserved() / 1024**3
+                    free = torch.cuda.mem_get_info()[0] / 1024**3
+                    print(f"[VRAM] After cache setup: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {free:.2f}GB free", flush=True)
+
                 with get_autocast_context(pipeline.mula_device.type, pipeline.mula_dtype):
                     curr_token = pipeline.mula.generate_frame(
                         tokens=prompt_tokens,
@@ -1998,7 +2015,7 @@ def _run_pipeline():
 
             with torch.no_grad():
                 pipeline_inputs = {
-                    "lyrics": request.lyrics,
+                    "lyrics": request.lyrics or "",  # heartlib expects string, not None
                    "tags": sound_tags,
                 }
                 if ref_audio_path:

From f0d55fe24ae8386d65b37d842f36b74c9a51a59f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 30 Jan 2026 09:02:36 +0000
Subject: [PATCH 3/3] Address code review feedback - improve MPS isolation and variable naming

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 backend/app/services/music_service.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/backend/app/services/music_service.py b/backend/app/services/music_service.py
index fa7c0a5..2b3c099 100644
--- a/backend/app/services/music_service.py
+++ b/backend/app/services/music_service.py
@@ -233,8 +233,8 @@ def detect_optimal_gpu_config() -> dict:
         if num_gpus == 1:
             # Use FREE VRAM for threshold decisions (accounts for VRAM used by other apps)
             vram = gpu_info[0]["vram_free_gb"]
-            total_vram = gpu_info[0]["vram_gb"]
-            print(f"[Auto-Config] Using FREE VRAM ({vram:.1f}GB) for configuration (total: {total_vram:.1f}GB)", flush=True)
+            gpu_total_vram = gpu_info[0]["vram_gb"]
+            print(f"[Auto-Config] Using FREE VRAM ({vram:.1f}GB) for configuration (total: {gpu_total_vram:.1f}GB)", flush=True)
 
             if vram >= VRAM_THRESHOLD_FULL_PRECISION:
                 # 20GB+: Full precision, no swapping needed
@@ -756,8 +756,8 @@ def generate_with_callback(inputs, callback=None, **kwargs):
                 bs_size = 2 if cfg_scale != 1.0 else 1
                 pipeline.mula.setup_caches(bs_size)
 
-                # Log VRAM usage after cache setup
-                if torch.cuda.is_available():
+                # Log VRAM usage after cache setup (CUDA only)
+                if torch.cuda.is_available() and pipeline.mula_device.type == 'cuda':
                     allocated = torch.cuda.memory_allocated() / 1024**3
                     reserved = torch.cuda.memory_reserved() / 1024**3
                     free = torch.cuda.mem_get_info()[0] / 1024**3
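Reviewer note: the patches above switch the single-GPU decision logic from total VRAM (get_device_properties().total_memory) to free VRAM (torch.cuda.mem_get_info()), so memory already held by other processes no longer pushes the service into a configuration it cannot fit. Below is a minimal standalone sketch of that probe for local testing; pick_precision() and its 20.0 GB default are illustrative assumptions standing in for VRAM_THRESHOLD_FULL_PRECISION, not code taken from this repository.

# Sketch of the free-vs-total VRAM probe used in the patches above.
# pick_precision() and the 20 GB threshold are illustrative assumptions.
import torch

def probe_gpus() -> dict:
    """Return per-GPU total/free VRAM (GB) and compute capability."""
    gpus = {}
    if not torch.cuda.is_available():
        return gpus
    for i in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(i)
        with torch.cuda.device(i):
            free_bytes, total_bytes = torch.cuda.mem_get_info()  # driver-level (free, total)
        gpus[i] = {
            "name": props.name,
            "vram_gb": total_bytes / 1024**3,
            "vram_free_gb": free_bytes / 1024**3,  # what is actually available right now
            "compute_capability": props.major + props.minor / 10,
        }
    return gpus

def pick_precision(free_gb: float, full_precision_threshold_gb: float = 20.0) -> str:
    """Decide on a precision mode from FREE VRAM, not total VRAM."""
    return "full" if free_gb >= full_precision_threshold_gb else "quantized/offloaded"

if __name__ == "__main__":
    for i, info in probe_gpus().items():
        mode = pick_precision(info["vram_free_gb"])
        print(f"GPU {i}: {info['name']} "
              f"{info['vram_free_gb']:.1f}GB free / {info['vram_gb']:.1f}GB total -> {mode}")

Design note: mem_get_info() reports device-wide free memory as seen by the driver, which is why it catches VRAM consumed by other applications, whereas memory_allocated()/memory_reserved() only reflect the current process's PyTorch caching allocator.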