diff --git a/README.md b/README.md index bcbbffe..baee9fd 100644 --- a/README.md +++ b/README.md @@ -2,13 +2,11 @@ ![logo](https://github.com/Saurabh620/Voice-Signal-Processing-using-Python-GUI/blob/main/Screenshot%202023-12-21%20192640.png) - - -In this project we used TESS voice dataset and processed it and perform emotion prediction +In this project we used the TESS voice dataset, processed it, and performed emotion prediction. The principal objective of this endeavor is to employ sophisticated Signal Processing methodologies for the -purpose of decoding and scrutinizing emotional cues present in speech signals. +purpose of decoding and scrutinizing emotional cues present in speech signals. -CNN and LSTM model is used in this project +CNN and LSTM models are used in this project. The system must effectively prepare the TESS dataset through various preprocessing steps, ensuring the data is cleaned, normalized, and segmented for consistent analysis. It should implement signal processing techniques such as waveplot generation, spectrogram computation, and feature extraction methods like @@ -18,13 +16,38 @@ Dataset used in this project Toronto emotional speech set (TESS) https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess ------------------------------------------------------------------------------- -STEPS TO RUN THIS PROJECT +## Steps to run this project + +1. Download the dataset and update the dataset path to point to your local copy. +2. Install dependencies: + +```bash +pip install -r requirements.txt +``` + +3. Train/evaluate model using notebook: + - `MainProgram.ipynb` +4. Make sure trained model exists at: + - `model/SER_model.h5` + +------------------------------------------------------------------------------------ +## Streamlit UI (optimized) + +This repo now includes a Streamlit interface in `streamlit_app.py`.
+ +Run it with: + +```bash +streamlit run streamlit_app.py +``` + +Features: +- Upload audio files (wav/mp3/ogg/flac/m4a) +- Automatic MFCC extraction (aligned with your notebook strategy) +- Class probability table + chart +- Fast model reuse via Streamlit cache -first download the dataset and add your path for dataset -run pip install, " -r requirements.txt", -run firstly MainProgram -then for GUI run Launcher ------------------------------------------------------------------------------------ -use Jupyter for best result +Use Jupyter for training workflow and Streamlit for interactive prediction UI. -------------------------------------------------------------------------------------- -If you want documentation of this project and some help ,mail me on rawatsaurabh620@gmail.com +If you want documentation of this project and some help, mail me on rawatsaurabh620@gmail.com
diff --git a/requirements.txt b/requirements.txt
index c864bf4..9bc916d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,3 +10,4 @@ tensorflow==2.6.0
 keras==2.6.0
 sounddevice==0.4.2
 soundfile==0.10.3.post1
+streamlit==1.45.1
diff --git a/streamlit_app.py b/streamlit_app.py
new file mode 100644
index 0000000..c22695c
--- /dev/null
+++ b/streamlit_app.py
@@ -0,0 +1,146 @@
+"""Streamlit UI that predicts the emotion of an uploaded speech clip."""
+
+import io
+from pathlib import Path
+
+import librosa
+import numpy as np
+import soundfile as sf
+import streamlit as st
+
+try:
+    from keras.models import load_model
+except Exception:  # fallback for some environments
+    from tensorflow.keras.models import load_model
+
+EMOTIONS = [
+    "Angry",
+    "Disgust",
+    "Fear",
+    "Happy",
+    "Neutral",
+    "Pleasant Surprise",
+    "Sad",
+]
+
+MODEL_DEFAULT_PATH = "model/SER_model.h5"
+
+# Preprocessing parameters shared by every prediction request.
+TARGET_SAMPLE_RATE = 22050
+CLIP_OFFSET_SECONDS = 0.5
+CLIP_DURATION_SECONDS = 3.0
+N_MFCC = 40
+
+
+def extract_mfcc_from_bytes(audio_bytes: bytes) -> np.ndarray:
+    """Decode audio bytes and return the model input tensor of shape (1, 40, 1).
+
+    The clip is decoded with ``soundfile``, down-mixed to mono, resampled to
+    22050 Hz, cut to the 3-second window that starts 0.5 s into the recording
+    (zero-padded at the end when the recording is shorter), and summarised as
+    the per-coefficient mean of 40 MFCCs.
+
+    Args:
+        audio_bytes: Raw contents of an audio file.
+
+    Returns:
+        Array of shape ``(1, 40, 1)``.
+
+    Raises:
+        RuntimeError: Typically raised by ``soundfile.read`` when the bytes are
+            empty or in a format the installed libsndfile cannot decode.
+    """
+    audio, sr = sf.read(io.BytesIO(audio_bytes), dtype="float32")
+
+    # Multi-channel input arrives as (frames, channels); average to mono.
+    if audio.ndim > 1:
+        audio = np.mean(audio, axis=1)
+
+    # Match notebook/GUI preprocessing as closely as possible.
+    if sr != TARGET_SAMPLE_RATE:
+        audio = librosa.resample(audio, orig_sr=sr, target_sr=TARGET_SAMPLE_RATE)
+        sr = TARGET_SAMPLE_RATE
+
+    # Use the same effective slicing strategy as existing notebook code.
+    start = int(CLIP_OFFSET_SECONDS * sr)
+    end = start + int(CLIP_DURATION_SECONDS * sr)
+    if len(audio) < end:
+        # Zero-pad so the analysis window always has the same length.
+        audio = np.pad(audio, (0, end - len(audio)), mode="constant")
+    sliced = audio[start:end]
+
+    mfcc = librosa.feature.mfcc(y=sliced, sr=sr, n_mfcc=N_MFCC)
+    mfcc_mean = np.mean(mfcc.T, axis=0)
+
+    return mfcc_mean.reshape(1, N_MFCC, 1)
+
+
+@st.cache_resource(show_spinner=False)
+def get_model(model_path: str):
+    """Load the Keras model at ``model_path``; Streamlit caches it per path."""
+    return load_model(model_path)
+
+
+def _render_prediction(probs: np.ndarray) -> None:
+    """Show the top emotion plus the per-class probability table and chart."""
+    if len(probs) != len(EMOTIONS):
+        st.error(
+            f"Model output has {len(probs)} classes, but UI expects {len(EMOTIONS)} emotions. "
+            "Please align EMOTIONS with your training label order."
+        )
+        return
+
+    pred_idx = int(np.argmax(probs))
+    pred_label = EMOTIONS[pred_idx]
+    pred_conf = float(probs[pred_idx])
+
+    st.success(f"Predicted emotion: **{pred_label}** ({pred_conf:.2%})")
+
+    st.subheader("Class probabilities")
+    table_rows = [{"emotion": label, "probability": float(p)} for label, p in zip(EMOTIONS, probs)]
+    st.dataframe(table_rows, use_container_width=True)
+    st.bar_chart({"probability": probs}, x_label="class_index", y_label="probability")
+
+
+def main() -> None:
+    """Render the page: model selection, audio upload, and prediction output."""
+    st.set_page_config(page_title="Speech Emotion Recognition", page_icon="🎙️", layout="centered")
+
+    st.title("🎙️ Speech Emotion Recognition")
+    st.caption("Upload a WAV/MP3 audio file and predict emotion using your trained SER model.")
+
+    model_path = st.sidebar.text_input("Model path", value=MODEL_DEFAULT_PATH).strip()
+    # Path("") resolves to the current directory, so reject an empty path explicitly.
+    model_exists = bool(model_path) and Path(model_path).exists()
+    if not model_exists:
+        st.warning(f"Model file not found: `{model_path}`. Upload audio is disabled until model is available.")
+
+    # Actually disable the widget so the warning above is accurate.
+    uploaded = st.file_uploader(
+        "Upload audio",
+        type=["wav", "mp3", "ogg", "flac", "m4a"],
+        disabled=not model_exists,
+    )
+
+    if uploaded is not None and model_exists:
+        # getvalue() returns the full payload regardless of the stream position,
+        # whereas read() yields b"" once the buffer has been consumed.
+        audio_bytes = uploaded.getvalue()
+        st.audio(audio_bytes, format=uploaded.type or "audio/wav")
+
+        try:
+            with st.spinner("Extracting features and running prediction..."):
+                model = get_model(model_path)
+                features = extract_mfcc_from_bytes(audio_bytes)
+                probs = model.predict(features, verbose=0)[0]
+        except Exception as exc:  # UI boundary: report instead of a raw traceback
+            st.error(f"Could not analyse `{uploaded.name}`: {exc}")
+        else:
+            _render_prediction(probs)
+
+    st.markdown("---")
+    st.markdown("**Tips**: Keep audio clear and speech-focused. Very noisy clips may reduce accuracy.")
+
+
+if __name__ == "__main__":
+    main()