35 changes: 26 additions & 9 deletions backend/app/services/music_service.py
@@ -198,33 +198,43 @@ def detect_optimal_gpu_config() -> dict:
 
     for i in range(num_gpus):
         props = torch.cuda.get_device_properties(i)
-        vram_gb = props.total_memory / (1024 ** 3)
+        total_vram_gb = props.total_memory / (1024 ** 3)
+        # Get available/free VRAM (total - already allocated)
+        with torch.cuda.device(i):
+            free_vram_bytes = torch.cuda.mem_get_info()[0]  # (free, total)
+        free_vram_gb = free_vram_bytes / (1024 ** 3)
         compute_cap = props.major + props.minor / 10
         gpu_info[i] = {
             "name": props.name,
-            "vram_gb": vram_gb,
+            "vram_gb": total_vram_gb,
+            "vram_free_gb": free_vram_gb,
             "compute_capability": compute_cap,
             "supports_flash_attention": compute_cap >= 7.0,
         }
-        total_vram += vram_gb
-        if vram_gb > max_vram:
-            max_vram = vram_gb
+        total_vram += total_vram_gb
+        # Use FREE VRAM for decision making, not total
+        if free_vram_gb > max_vram:
+            max_vram = free_vram_gb
             max_vram_gpu = i
         if compute_cap > max_compute:
             max_compute = compute_cap
             max_compute_gpu = i
 
     result["gpu_info"] = gpu_info
 
-    # Log detected GPUs
+    # Log detected GPUs with both total and available VRAM
     print(f"\n[Auto-Config] Detected {num_gpus} GPU(s):", flush=True)
     for i, info in gpu_info.items():
         fa_status = "✓ Flash Attention" if info["supports_flash_attention"] else "✗ No Flash Attention"
-        print(f"  GPU {i}: {info['name']} ({info['vram_gb']:.1f} GB, SM {info['compute_capability']}) - {fa_status}", flush=True)
+        vram_status = f"{info['vram_free_gb']:.1f}GB free / {info['vram_gb']:.1f}GB total"
+        print(f"  GPU {i}: {info['name']} ({vram_status}, SM {info['compute_capability']}) - {fa_status}", flush=True)
 
     # Decision logic for single GPU
     if num_gpus == 1:
-        vram = gpu_info[0]["vram_gb"]
+        # Use FREE VRAM for threshold decisions (accounts for VRAM used by other apps)
+        vram = gpu_info[0]["vram_free_gb"]
+        gpu_total_vram = gpu_info[0]["vram_gb"]
+        print(f"[Auto-Config] Using FREE VRAM ({vram:.1f}GB) for configuration (total: {gpu_total_vram:.1f}GB)", flush=True)
 
         if vram >= VRAM_THRESHOLD_FULL_PRECISION:
             # 20GB+: Full precision, no swapping needed
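Reviewer note: the hunk above switches GPU selection from total to free VRAM. A minimal standalone sketch of that distinction, assuming only stock PyTorch (the helper name is illustrative, not part of this PR): torch.cuda.mem_get_info() reports what the driver can still hand out, while get_device_properties(i).total_memory is the card's capacity regardless of what other processes already hold.

import torch

def probe_vram(device_index: int = 0) -> dict:
    """Illustrative: compare a GPU's total capacity with what is free right now."""
    props = torch.cuda.get_device_properties(device_index)
    with torch.cuda.device(device_index):
        free_bytes, _total_bytes = torch.cuda.mem_get_info()  # (free, total) in bytes
    return {
        "name": props.name,
        "total_gb": props.total_memory / (1024 ** 3),
        "free_gb": free_bytes / (1024 ** 3),  # shrinks when other processes hold VRAM
    }

if torch.cuda.is_available():
    print(probe_vram(0))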
@@ -746,6 +756,13 @@ def generate_with_callback(inputs, callback=None, **kwargs):
     bs_size = 2 if cfg_scale != 1.0 else 1
     pipeline.mula.setup_caches(bs_size)
 
+    # Log VRAM usage after cache setup (CUDA only)
+    if torch.cuda.is_available() and pipeline.mula_device.type == 'cuda':
+        allocated = torch.cuda.memory_allocated() / 1024**3
+        reserved = torch.cuda.memory_reserved() / 1024**3
+        free = torch.cuda.mem_get_info()[0] / 1024**3
+        print(f"[VRAM] After cache setup: {allocated:.2f}GB allocated, {reserved:.2f}GB reserved, {free:.2f}GB free", flush=True)
+
     with get_autocast_context(pipeline.mula_device.type, pipeline.mula_dtype):
         curr_token = pipeline.mula.generate_frame(
             tokens=prompt_tokens,
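Reviewer note: the three numbers in the new [VRAM] log line measure different things: memory_allocated() is memory held by live tensors, memory_reserved() is what PyTorch's caching allocator has claimed from the driver, and mem_get_info()[0] is what the driver can still allocate to any process. A minimal sketch of the same pattern as a reusable helper (the helper name and call site are illustrative, not from this PR):

import torch

def log_vram(tag: str) -> None:
    """Illustrative helper mirroring the [VRAM] log line added above."""
    if not torch.cuda.is_available():
        return
    allocated = torch.cuda.memory_allocated() / 1024 ** 3  # live tensor memory
    reserved = torch.cuda.memory_reserved() / 1024 ** 3    # caching-allocator reservation
    free = torch.cuda.mem_get_info()[0] / 1024 ** 3        # driver-level free memory
    print(f"[VRAM] {tag}: {allocated:.2f}GB allocated, "
          f"{reserved:.2f}GB reserved, {free:.2f}GB free", flush=True)

log_vram("after cache setup")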
@@ -1998,7 +2015,7 @@ def _run_pipeline():
 
     with torch.no_grad():
         pipeline_inputs = {
-            "lyrics": request.lyrics,
+            "lyrics": request.lyrics or "",  # heartlib expects string, not None
             "tags": sound_tags,
         }
         if ref_audio_path:
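Reviewer note: the last hunk coalesces a possibly-None lyrics field to an empty string before building pipeline_inputs, so downstream string handling never sees None. A tiny sketch of that behavior with an assumed request shape (not the project's actual model):

from dataclasses import dataclass
from typing import Optional

@dataclass
class FakeRequest:  # assumed shape, for illustration only
    lyrics: Optional[str] = None

request = FakeRequest()
pipeline_inputs = {"lyrics": request.lyrics or ""}  # None becomes "", keeping str operations safe
assert pipeline_inputs["lyrics"] == ""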