Sonic-Vision/utils.py at main · Muhammad-Rebaal/Sonic-Vision · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import torch
import torch.nn as nn
import torchaudio.transforms as T
import numpy as np
import librosa
import io
import soundfile as sf

class AudioProcessor:
    """Turn raw audio samples into decibel-scaled mel spectrogram tensors.

    The pipeline is a ``torchaudio`` ``MelSpectrogram`` followed by
    ``AmplitudeToDB``. Every spectrogram setting is a keyword argument, and
    calling ``AudioProcessor()`` with no arguments builds the default
    44.1 kHz / 1024-point FFT / 128-mel configuration.
    """

    def __init__(
        self,
        *,
        sample_rate: int = 44100,
        n_fft: int = 1024,
        hop_length: int = 512,
        n_mels: int = 128,
        f_min: float = 0,
        f_max: float = 11025,
    ):
        """Build the mel-spectrogram + dB conversion pipeline.

        Args:
            sample_rate: Sample rate (Hz) the incoming audio is expected
                to have.
            n_fft: FFT size passed to ``MelSpectrogram``.
            hop_length: Hop between successive frames, in samples.
            n_mels: Number of mel filter banks.
            f_min: Lowest frequency (Hz) covered by the mel filters.
            f_max: Highest frequency (Hz) covered by the mel filters.
        """
        self.sample_rate = sample_rate
        self.transform = nn.Sequential(
            T.MelSpectrogram(
                sample_rate=sample_rate,
                n_fft=n_fft,
                hop_length=hop_length,
                n_mels=n_mels,
                f_min=f_min,
                f_max=f_max,
            ),
            T.AmplitudeToDB(),
        )

    def process_audio_chunk(self, audio_data):
        """Convert one chunk of audio samples into a spectrogram tensor.

        Args:
            audio_data: Audio samples as a NumPy array or any array-like
                (e.g. a list of floats). Values are converted to float32.

        Returns:
            torch.Tensor: The output of ``self.transform`` with one extra
            leading dimension added before the transform and one after it.
            For a 1-D input this is expected to be
            ``(1, 1, n_mels, frames)`` per the torchaudio documentation.
        """
        # torch.from_numpy rejects negative-stride views (e.g. chunk[::-1])
        # and non-ndarray inputs; coercing to a C-contiguous float32 array
        # first makes those inputs work and is a no-op (no copy) for data
        # that is already contiguous float32.
        samples = np.ascontiguousarray(audio_data, dtype=np.float32)
        waveform = torch.from_numpy(samples).unsqueeze(0)
        spectrogram = self.transform(waveform)
        return spectrogram.unsqueeze(0)

def process_uploaded_audio(audio_bytes):
    """
    Decode an uploaded audio file into a mono waveform at 44100Hz.

    The bytes are decoded with soundfile as float32, multi-channel audio is
    averaged down to a single channel, and the result is resampled with
    librosa when the file's native rate is not 44100Hz.

    Returns:
        audio_data (np.array): The processed audio waveform.
        sample_rate (int): The sample rate (always 44100).
    """
    target_rate = 44100

    # Decode the in-memory file contents
    samples, native_rate = sf.read(io.BytesIO(audio_bytes), dtype='float32')

    # Multi-channel data has channels on axis 1; average them into mono
    if samples.ndim > 1:
        samples = np.mean(samples, axis=1)

    # Already at the target rate: nothing left to do
    if native_rate == target_rate:
        return samples, native_rate

    resampled = librosa.resample(
        y=samples,
        orig_sr=native_rate,
        target_sr=target_rate,
    )
    return resampled, target_rate