# Import libs
import librosa
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import os
# CREMA-D filename structure: ActorID_SentenceID_EmotionID_IntensityID.wav
audio_dir = "./data/CREMA_D/"
def parse_cremad_filename(file_path):
    # strip the directory and the .wav extension, then split on underscores
    name = os.path.splitext(os.path.basename(file_path))[0]
    parts = name.split('_')
    actor_id = parts[0]
    sentence_id = parts[1]
    emotion = parts[2]
    intensity = parts[3]
    # map the emotion code to a readable label
    emotion_map = {
        "ANG": "angry",
        "DIS": "disgust",
        "FEA": "fear",
        "HAP": "happy",
        "NEU": "neutral",
        "SAD": "sad",
    }
    return {
        "actor_id": actor_id,
        "sentence": sentence_id,
        "emotion": emotion_map[emotion],
        "intensity": intensity,
    }
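# Illustrative usage (filename modeled on the sample rows shown later in this document):
# parse_cremad_filename("1022_ITS_ANG_XX.wav")
# -> {'actor_id': '1022', 'sentence': 'ITS', 'emotion': 'angry', 'intensity': 'XX'}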
def extract_features(dir_path):
    """
    Extracts audio features from every .wav file in a given directory using librosa.
    Features:
    - MFCCs (13 coefficients, mean and std)
    - Spectral Centroid, Rolloff, and Bandwidth
    - Root Mean Square Energy
    - Zero Crossing Rate
    - Chroma STFT
    Returns:
    list of dict: Each dictionary contains the features and the metadata parsed from the filename.
    """
    # initialize empty list to store the features of all files
    all_features = []
    # iterate over the directory entries
    for entry in os.scandir(dir_path):
        if entry.is_file() and entry.name.lower().endswith(".wav"):
            # load the .wav file at a fixed sample rate
            y, sr = librosa.load(entry.path, sr=22050)
            # initialize empty dict for this file's features
            features = {}
            # 0. File metadata
            fmd = parse_cremad_filename(entry.name)
            audio_duration = len(y) / sr
            features['actor_id'] = fmd['actor_id']
            features['sentence'] = fmd['sentence']
            features['emotion'] = fmd['emotion']
            features['intensity'] = fmd['intensity']
            features['audio_duration'] = audio_duration
            features['sample_rate'] = sr
            # 1. MFCCs (most important for speech)
            mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
            for i in range(13):
                features[f'mfcc_{i+1}_mean'] = np.mean(mfccs[i])
                features[f'mfcc_{i+1}_std'] = np.std(mfccs[i])
            # 2. Spectral features
            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
            features['spectral_centroid_mean'] = np.mean(spectral_centroid)
            features['spectral_centroid_std'] = np.std(spectral_centroid)
            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
            features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
            features['spectral_bandwidth_mean'] = np.mean(spectral_bandwidth)
            # 3. Energy and rhythm
            rms = librosa.feature.rms(y=y)
            features['rms_mean'] = np.mean(rms)
            features['rms_std'] = np.std(rms)
            zcr = librosa.feature.zero_crossing_rate(y)
            features['zcr_mean'] = np.mean(zcr)
            # 4. Pitch and harmony
            chroma = librosa.feature.chroma_stft(y=y, sr=sr)
            features['chroma_mean'] = np.mean(chroma)
            features['chroma_std'] = np.std(chroma)
            all_features.append(features)
    return all_features
# The function call below is commented out because the features have already been extracted and saved to a CSV file.
# call the function, passing the directory of audio files
#audio_features = extract_features(audio_dir)
# convert to a DataFrame
#df = pd.DataFrame(audio_features)
# export the DataFrame to CSV for reproducibility
#df.to_csv("./data/crema_d.csv")
df = pd.read_csv("./data/crema_d.csv", index_col=0)
Acoustic Emotion Recognition
Evaluating the accuracy of machine learning methods for recognizing emotion from acoustic features
Project Overview
This project investigates the application of machine learning techniques to classify emotions from speech audio using the CREMA-D (Crowd-Sourced Emotional Multimodal Actors Dataset). The study focuses on distinguishing six primary emotions (happy, sad, angry, fear, neutral, and disgust) through acoustic feature analysis and explores how emotional intensity affects classification performance. By combining traditional feature engineering with ensemble learning methods, this research aims to develop robust emotion recognition models while examining the relationship between speech intensity and model confidence.
The project addresses two fundamental questions in acoustic emotion recognition: first, whether traditional machine learning algorithms can accurately classify emotions using engineered audio features, and second, whether ensemble methods can significantly improve classification performance compared to individual models. This investigation contributes to the growing field of affective computing while demonstrating practical applications of ensemble learning principles in speech emotion recognition.
Dataset Description
Primary Dataset: CREMA-D (Crowd-Sourced Emotional Multimodal Actors Dataset). Provenance: CREMA-D is a validated multimodal database created by researchers and refined through crowd-sourced validation. The dataset contains emotional speech recordings from professional actors, making it ideal for supervised learning approaches to emotion classification.
Dimensions & Structure:
Attribute | Details |
---|---|
Total Files | 7,442 audio clips from diverse emotional expressions |
Speakers | 91 professional actors (48 male, 43 female) |
Age Range | 20–74 years, providing demographic diversity |
Target Emotions | 6 emotions selected for analysis (happy, sad, angry, fear, neutral, disgust) |
Emotional Intensities | Multiple intensity levels (low, medium, high, unspecified) |
File Format | WAV files suitable for feature extraction |
Sentence Variety | 12 different sentences to reduce linguistic bias |
Dataset Selection Rationale: CREMA-D was chosen for its substantial size, demographic diversity, and established use in emotion recognition research. The dataset’s systematic organization and intensity labels directly support both research questions, while its availability through Kaggle ensures reproducible research practices.
Feature Engineering
The analysis employs five key acoustic feature categories using the librosa library:
1. Spectral Features (Centroid, Rolloff, Bandwidth): Describe the shape of the frequency spectrum, capturing timbral characteristics that help distinguish emotional expressions
2. MFCCs (Mel-frequency cepstral coefficients): Extract 13 coefficients representing the short-term power spectrum, fundamental for speech emotion recognition
3. Chroma Features: Capture pitch class energy distribution, providing harmonic content information relevant to emotional prosody
4. Zero-Crossing Rate: Quantifies signal noisiness by measuring zero-axis crossings, distinguishing between voiced and unvoiced speech segments
5. Root Mean Square (RMS) Energy: Measures overall signal energy, correlating with loudness and emotional intensity
These features collectively capture the three primary acoustic dimensions of emotional expression: spectral characteristics, temporal dynamics, and energy distribution. This multi-dimensional approach aligns with established psychoacoustic research showing that human emotional perception relies on diverse auditory cues processed simultaneously.
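To make concrete how these features become fixed-length model inputs, the minimal sketch below (assuming a hypothetical clip path "example.wav") computes one feature per category and pools the frame-level values with mean and standard deviation, mirroring the extraction pipeline above.

```python
import librosa
import numpy as np

# Illustrative sketch: pool frame-level features into fixed-length statistics.
# "example.wav" is a hypothetical clip path used only for demonstration.
y, sr = librosa.load("example.wav", sr=22050)

frame_features = {
    "mfcc": librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13),        # shape (13, n_frames)
    "chroma": librosa.feature.chroma_stft(y=y, sr=sr),          # shape (12, n_frames)
    "zcr": librosa.feature.zero_crossing_rate(y),               # shape (1, n_frames)
    "rms": librosa.feature.rms(y=y),                            # shape (1, n_frames)
    "centroid": librosa.feature.spectral_centroid(y=y, sr=sr),  # shape (1, n_frames)
}

# Mean/std pooling across frames yields a fixed-length summary regardless of clip duration.
summary = {f"{name}_mean": float(np.mean(m)) for name, m in frame_features.items()}
summary.update({f"{name}_std": float(np.std(m)) for name, m in frame_features.items()})
print(summary)
```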
Signal Processing (Extract, Transform, Load)
This pipeline enables the conversion of complex, high-dimensional waveforms into compact, informative representations—such as MFCCs or spectral centroids—that capture the essence of the sound.
Dataset
 | actor_id | sentence | emotion | intensity | audio_duration | sample_rate | mfcc_1_mean | mfcc_1_std | mfcc_2_mean | mfcc_2_std | ... | mfcc_13_std | spectral_centroid_mean | spectral_centroid_std | spectral_rolloff_mean | spectral_bandwidth_mean | rms_mean | rms_std | zcr_mean | chroma_mean | chroma_std |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1022 | ITS | angry | XX | 2.435782 | 22050 | -266.28894 | 123.935930 | 108.554596 | 39.694670 | ... | 10.892748 | 1754.476189 | 965.090546 | 3350.053711 | 1701.814132 | 0.096146 | 0.110308 | 0.095815 | 0.328043 | 0.306033 |
1 | 1037 | ITS | angry | XX | 3.003039 | 22050 | -346.40980 | 83.344710 | 125.381540 | 42.769300 | ... | 10.813552 | 1624.501830 | 1058.895061 | 3325.968863 | 1722.048847 | 0.038797 | 0.031771 | 0.091384 | 0.370311 | 0.322135 |
2 | 1060 | ITS | neutral | XX | 2.402404 | 22050 | -421.48450 | 36.981285 | 140.371900 | 19.000977 | ... | 5.230837 | 1406.515534 | 563.173067 | 3186.085862 | 1848.645713 | 0.008990 | 0.005157 | 0.060317 | 0.403894 | 0.303255 |
3 | 1075 | ITS | neutral | XX | 2.435782 | 22050 | -413.22550 | 50.740425 | 140.576370 | 28.177567 | ... | 5.935963 | 1370.133893 | 682.471987 | 3006.958008 | 1780.081965 | 0.011599 | 0.007132 | 0.062360 | 0.414367 | 0.308347 |
4 | 1073 | IOM | disgust | XX | 2.869569 | 22050 | -415.93317 | 61.064003 | 136.759430 | 19.811580 | ... | 6.920853 | 1164.595733 | 422.881734 | 2682.273028 | 1686.101546 | 0.014929 | 0.013538 | 0.041378 | 0.411929 | 0.306035 |
5 rows × 41 columns
Dataset Information
<class 'pandas.core.frame.DataFrame'>
Index: 7442 entries, 0 to 7441
Data columns (total 41 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 actor_id 7442 non-null int64
1 sentence 7442 non-null object
2 emotion 7442 non-null object
3 intensity 7442 non-null object
4 audio_duration 7442 non-null float64
5 sample_rate 7442 non-null int64
6 mfcc_1_mean 7442 non-null float64
7 mfcc_1_std 7442 non-null float64
8 mfcc_2_mean 7442 non-null float64
9 mfcc_2_std 7442 non-null float64
10 mfcc_3_mean 7442 non-null float64
11 mfcc_3_std 7442 non-null float64
12 mfcc_4_mean 7442 non-null float64
13 mfcc_4_std 7442 non-null float64
14 mfcc_5_mean 7442 non-null float64
15 mfcc_5_std 7442 non-null float64
16 mfcc_6_mean 7442 non-null float64
17 mfcc_6_std 7442 non-null float64
18 mfcc_7_mean 7442 non-null float64
19 mfcc_7_std 7442 non-null float64
20 mfcc_8_mean 7442 non-null float64
21 mfcc_8_std 7442 non-null float64
22 mfcc_9_mean 7442 non-null float64
23 mfcc_9_std 7442 non-null float64
24 mfcc_10_mean 7442 non-null float64
25 mfcc_10_std 7442 non-null float64
26 mfcc_11_mean 7442 non-null float64
27 mfcc_11_std 7442 non-null float64
28 mfcc_12_mean 7442 non-null float64
29 mfcc_12_std 7442 non-null float64
30 mfcc_13_mean 7442 non-null float64
31 mfcc_13_std 7442 non-null float64
32 spectral_centroid_mean 7442 non-null float64
33 spectral_centroid_std 7442 non-null float64
34 spectral_rolloff_mean 7442 non-null float64
35 spectral_bandwidth_mean 7442 non-null float64
36 rms_mean 7442 non-null float64
37 rms_std 7442 non-null float64
38 zcr_mean 7442 non-null float64
39 chroma_mean 7442 non-null float64
40 chroma_std 7442 non-null float64
dtypes: float64(36), int64(2), object(3)
memory usage: 2.4+ MB
Target Frequency
Intensity Frequency
Spectral Centroid Distribution
Root Mean Square Distribution
Zero Crossing Rate Distribution
Chroma Distribution
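The plots above summarize class balance and the spread of several engineered features. A minimal sketch of how such plots could be produced from the df loaded above is shown below; the exact figure styling in the rendered report may differ.

```python
# Sketch: class-balance counts and feature distributions (assumes df from the CSV above).
fig, axes = plt.subplots(2, 3, figsize=(15, 8))

sns.countplot(data=df, x="emotion", ax=axes[0, 0])    # target frequency
axes[0, 0].set_title("Target Frequency")

sns.countplot(data=df, x="intensity", ax=axes[0, 1])  # intensity frequency
axes[0, 1].set_title("Intensity Frequency")

sns.histplot(data=df, x="spectral_centroid_mean", kde=True, ax=axes[0, 2])
axes[0, 2].set_title("Spectral Centroid Distribution")

sns.histplot(data=df, x="rms_mean", kde=True, ax=axes[1, 0])
axes[1, 0].set_title("Root Mean Square Distribution")

sns.histplot(data=df, x="zcr_mean", kde=True, ax=axes[1, 1])
axes[1, 1].set_title("Zero Crossing Rate Distribution")

sns.histplot(data=df, x="chroma_mean", kde=True, ax=axes[1, 2])
axes[1, 2].set_title("Chroma Distribution")

plt.tight_layout()
plt.show()
```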
Research Questions
Question 1: Basic Emotion Classification
Can we accurately classify six emotions (happy, sad, angry, fear, neutral, disgust) from audio features using traditional machine learning algorithms?
Question 2: Ensemble Learning Effectiveness
Can combining multiple machine learning algorithms (ensemble methods) significantly improve emotion classification accuracy compared to individual models, and which ensemble strategies work best for acoustic emotion recognition?
Analysis Plan
Question 1: Basic Emotion Classification
Target Variable: emotion
Model Implementation: Three complementary algorithms will be implemented and compared:
Model | Description |
---|---|
Random Forest | Offers robust performance with built-in feature importance rankings and handling of non-linear relationships |
Support Vector Machine | Excels with high-dimensional feature spaces common in audio analysis |
Multi-Layer Perceptron | Notable for its ability to distinguish data that is not linearly separable |
Evaluation Framework: Models will be assessed using train/test split methodology with comprehensive metrics including accuracy, precision, recall, and F1-score for each emotion class, providing detailed performance analysis across emotional categories.
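A minimal sketch of this evaluation setup is shown below, assuming the feature columns and emotion label from the df loaded above; the hyperparameters are illustrative placeholders rather than the final tuned values.

```python
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Feature matrix: drop metadata columns, keep the engineered acoustic features.
X = df.drop(columns=["actor_id", "sentence", "emotion", "intensity", "sample_rate"])
y = df["emotion"]

# Stratified train/test split so every emotion class appears in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

models = {
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "SVM": make_pipeline(StandardScaler(), SVC(kernel="rbf", probability=True, random_state=42)),
    "MLP": make_pipeline(StandardScaler(), MLPClassifier(hidden_layer_sizes=(64,), max_iter=500, random_state=42)),
}

# Per-class accuracy, precision, recall, and F1 for each individual model.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name)
    print(classification_report(y_test, model.predict(X_test)))
```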
Question 2: Ensemble Learning Effectiveness
Analysis Approach:
Base Model Training: Train all three algorithms (Random Forest, SVM, MLP) separately using identical feature sets and training data, establishing baseline performance metrics for each individual approach.
Performance Comparison: Conduct systematic comparison of individual model accuracy against ensemble methods using cross-validation, statistical significance testing, and detailed performance metrics to quantify improvement gains.
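One way to realize this comparison is sketched below, reusing the models dictionary and training split from the previous sketch; a soft-voting ensemble is used here as one illustrative ensemble strategy, not necessarily the only one the project will evaluate.

```python
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score

# Soft voting averages predicted class probabilities across the base models.
ensemble = VotingClassifier(
    estimators=[(name, model) for name, model in models.items()],
    voting="soft",
)

# 5-fold cross-validated accuracy for each base model and the ensemble.
for name, model in list(models.items()) + [("Soft Voting Ensemble", ensemble)]:
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring="accuracy")
    print(f"{name}: {scores.mean():.3f} ± {scores.std():.3f}")
```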
Technical Implementation
Programming Environment: Python with scientific computing stack
Key Libraries
Library | Purpose |
---|---|
librosa | Audio feature extraction and processing |
scikit-learn | Machine learning algorithms and evaluation |
pandas / numpy | Data manipulation and numerical computation |
matplotlib / seaborn | Visualization and results presentation |
Expected Project Timeline (~3 weeks)
Week 0: Data preparation and exploration
- Download CREMA-D from Kaggle (~2GB)
- Explore dataset structure and file naming
- Proposal write-up and review
Week 1: Feature extraction and dataset creation
- Implement feature extraction pipeline
- Process selected audio files
- Create clean feature dataset
- Exploratory data analysis of features vs emotions
Week 2: Individual model development
- Train baseline models (Random Forest, SVM, MLP)
- Hyperparameter tuning using GridSearchCV
- Performance evaluation and comparison
- Feature importance analysis
Week 3: Ensemble methods and final analysis
- Implement ensemble approaches
- Compare individual vs ensemble performance
- Statistical significance testing
- Final report and presentation
Project Structure
Folder / File Name | Description |
---|---|
.quarto/ | Quarto's internal folder, automatically created to manage rendering settings and cache. You typically don't touch this. |
_extra/ | Stores supporting materials that aren't part of the main outputs but are useful for context. |
_freeze/ | Keeps locked-in versions of documents to ensure consistency when rebuilding or sharing. |
data/ | Where all the data lives: raw inputs, cleaned datasets, and a README explaining structure and sources. |
images/ | Stores visual content like charts, graphs, and illustrations referenced in the .qmd files. |
style/ | Contains custom design elements, like SCSS files, that control the look and feel of the site. |
index.qmd | Acts as the homepage, giving a snapshot of what the project is about. |
about.qmd | Gives extra context: background info, author bio, or detailed project narrative. |
proposal.qmd | The full research plan: includes goals, methods, schedule, and how everything is structured. |
presentation.qmd | Slide deck made with Quarto to highlight the most important insights from the project. |
Works Cited
Banerjee, Gaurab, et al. Understanding Emotion Classification in Audio Data. Stanford CS224N Custom Project.
Livingstone, Steven R., and Frank A. Russo. “The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS): A Dynamic, Multimodal Set of Facial and Vocal Expressions in North American English.” PLOS ONE, vol. 13, no. 5, 16 May 2018, p. e0196391, www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio, https://doi.org/10.1371/journal.pone.0196391. Accessed 4 Aug. 2025.
Lok, Eu Jin. “CREMA-D.” Kaggle.com, 2019, www.kaggle.com/datasets/ejlok1/cremad. Accessed 4 Aug. 2025.
Moataz El Ayadi, et al. “Survey on Speech Emotion Recognition: Features, Classification Schemes, and Databases.” Pattern Recognition, vol. 44, no. 3, 14 Oct. 2010, pp. 572–587, ui.adsabs.harvard.edu/abs/2011PatRe..44..572E/abstract, https://doi.org/10.1016/j.patcog.2010.09.020. Accessed 19 Aug. 2025.