288 lines
11 KiB
Python
288 lines
11 KiB
Python
|
|
"""
|
||
|
|
IEMOCAP feature extraction script.
|
||
|
|
|
||
|
|
Expected dataset structure:
|
||
|
|
$DATA_ROOT/IEMOCAP_full_release/
|
||
|
|
Session1/ ... Session5/
|
||
|
|
dialog/
|
||
|
|
EmoEvaluation/ -> label files (.txt)
|
||
|
|
transcriptions/ -> transcript files (.txt)
|
||
|
|
sentences/wav/<dialog_id>/<utterance_id>.wav -> utterance wav files (one sub-directory per dialog)
|
||
|
|
|
||
|
|
Output: $ZSY/multimodal_affect/data/iemocap/
|
||
|
|
{train,val,test}_text.npy shape: (N, seq_len) hashed word ids (seq_len = 64), or (N, 768) first-token embeddings when --use_transformer is given
|
||
|
|
{train,val,test}_audio.npy shape: (N, 40) MFCC means
|
||
|
|
{train,val,test}_labels.npy shape: (N,) int labels
|
||
|
|
label_map.json
|
||
|
|
"""
|
||
|
|
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import json
|
||
|
|
import wave
|
||
|
|
import struct
|
||
|
|
import argparse
|
||
|
|
import numpy as np
|
||
|
|
from pathlib import Path
|
||
|
|
from typing import Optional
|
||
|
|
|
||
|
|
# ── constants ──────────────────────────────────────────────────────────────
|
||
|
|
# Raw IEMOCAP emotion tag → class index; "exc" shares index 1 with "hap".
EMOTION_MAP = dict(ang=0, hap=1, exc=1, sad=2, neu=3)
# Session directory names, in processing order.
SESSIONS = [f"Session{idx}" for idx in range(1, 6)]
# Human-readable class names, indexed by the values of EMOTION_MAP.
LABEL_NAMES = ["angry", "happy", "sad", "neutral"]
SAMPLE_RATE = 16_000
N_MFCC = 40
SEED = 42
|
||
|
|
|
||
|
|
|
||
|
|
# ── audio utilities (no libsndfile needed) ─────────────────────────────────
|
||
|
|
def _load_wav_stdlib(path: str):
    """Load a PCM WAV file with the stdlib ``wave`` module.

    Args:
        path: Path of the WAV file.

    Returns:
        1-D float32 array scaled to [-1, 1).  Multi-channel audio is
        down-mixed by averaging the channels.  The file's sample rate is
        not inspected and no resampling is done.

    Raises:
        ValueError: If the sample width is not 1, 2, 3 or 4 bytes.
        wave.Error: If the file is not a WAV file ``wave`` can read.
    """
    with wave.open(path, "rb") as f:
        n_channels = f.getnchannels()
        sampwidth = f.getsampwidth()
        raw = f.readframes(f.getnframes())

    # WAV sample data is little-endian, so the multi-byte dtypes are spelled
    # with an explicit "<" instead of relying on the host byte order.
    if sampwidth == 1:
        # 8-bit WAV PCM is unsigned and centred on 128.
        samples = (np.frombuffer(raw, dtype=np.uint8).astype(np.float32) - 128.0) / 128.0
    elif sampwidth == 2:
        samples = np.frombuffer(raw, dtype="<i2").astype(np.float32) / 32768.0
    elif sampwidth == 3:
        # numpy has no 24-bit integer type: assemble the three bytes by hand
        # and sign-extend from bit 23.
        b = np.frombuffer(raw, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
        vals = b[:, 0] | (b[:, 1] << 8) | (b[:, 2] << 16)
        vals = np.where(vals >= (1 << 23), vals - (1 << 24), vals)
        samples = vals.astype(np.float32) / 8388608.0
    elif sampwidth == 4:
        samples = np.frombuffer(raw, dtype="<i4").astype(np.float32) / 2147483648.0
    else:
        raise ValueError(f"Unsupported sample width: {sampwidth}")

    if n_channels > 1:
        # Samples are interleaved frame by frame: one row per frame.
        samples = samples.reshape(-1, n_channels).mean(axis=1)
    return samples
|
||
|
|
|
||
|
|
|
||
|
|
def _load_audio(path: str):
    """Decode an audio file to a mono float32 array.

    PyAV (``av``) is tried first; if it is not importable, raises while
    decoding, or yields no frames, the stdlib ``wave`` loader is used.

    Integer PCM frames from PyAV are rescaled to [-1, 1) so that both
    decoders return audio on the same scale.  No resampling is done.

    Args:
        path: Path of the audio file.

    Returns:
        1-D float32 array (channels averaged).

    Raises:
        Whatever ``_load_wav_stdlib`` raises when the fallback also fails.
    """
    try:
        import av
    except ImportError:
        # PyAV is optional.
        return _load_wav_stdlib(path)

    chunks = []
    try:
        container = av.open(path)
        try:
            stream = next(s for s in container.streams if s.type == "audio")
            for packet in container.demux(stream):
                for frame in packet.decode():
                    arr = frame.to_ndarray()
                    pcm_dtype = arr.dtype
                    if arr.ndim == 2:
                        n_ch = len(frame.layout.channels)
                        # Packed (interleaved) frames arrive as a single row of
                        # samples * channels values; split them into one row
                        # per channel before averaging.
                        if frame.format.is_packed and n_ch > 1:
                            arr = arr.reshape(-1, n_ch).T
                    arr = arr.astype(np.float32)
                    if np.issubdtype(pcm_dtype, np.unsignedinteger):
                        half = (float(np.iinfo(pcm_dtype).max) + 1.0) / 2.0
                        arr = (arr - half) / half
                    elif np.issubdtype(pcm_dtype, np.integer):
                        arr = arr / -float(np.iinfo(pcm_dtype).min)
                    if arr.ndim == 2:
                        arr = arr.mean(axis=0)
                    chunks.append(arr)
        finally:
            # Release the demuxer even when decoding raised part-way through.
            container.close()
    except Exception:
        # Best effort: any PyAV failure discards partial output and falls
        # through to the stdlib loader.
        chunks = []

    if chunks:
        return np.concatenate(chunks)
    return _load_wav_stdlib(path)
|
||
|
|
|
||
|
|
|
||
|
|
# ── MFCC (librosa when usable, pure-numpy mel + DCT fallback otherwise) ────
|
||
|
|
def _framing(signal, frame_len, hop_len):
    """Slice a 1-D signal into overlapping frames.

    Args:
        signal: 1-D array-like of samples.
        frame_len: Number of samples per frame.
        hop_len: Number of samples between the starts of consecutive frames.

    Returns:
        Array of shape (n_frames, frame_len).  Trailing samples that do not
        fill a whole frame are dropped.  A signal shorter than one frame is
        zero-padded at the end to exactly one frame, so at least one frame
        is always returned.
    """
    signal = np.asarray(signal)
    if len(signal) < frame_len:
        # Without padding the frame count would be <= 0 and the result empty,
        # which turns into NaN once a caller averages over frames.
        signal = np.pad(signal, (0, frame_len - len(signal)))
    n_frames = 1 + (len(signal) - frame_len) // hop_len
    idx = np.arange(frame_len)[None, :] + hop_len * np.arange(n_frames)[:, None]
    return signal[idx]
|
||
|
|
|
||
|
|
|
||
|
|
def compute_mfcc(signal: np.ndarray, sr: int = SAMPLE_RATE,
                 n_mfcc: int = N_MFCC, n_fft: int = 512,
                 hop_length: int = 160, n_mels: int = 40) -> np.ndarray:
    """Compute MFCCs with librosa when usable, else with a numpy fallback.

    The fallback applies a Hann window, a triangular mel filterbank spanning
    80 Hz to ``sr / 2``, the natural log of the mel power and an unnormalised
    DCT-II.
    NOTE(review): the fallback's scaling is presumably not numerically
    comparable to librosa's output, so features extracted on machines with
    and without librosa should not be mixed — confirm before doing so.

    Args:
        signal: 1-D mono audio samples.
        sr: Sample rate of ``signal`` in Hz.
        n_mfcc: Number of cepstral coefficients to return.
        n_fft: FFT size; also the frame length of the fallback.
        hop_length: Samples between consecutive frames.
        n_mels: Number of mel bands.

    Returns:
        Array of shape (T, n_mfcc), one row per frame.
    """
    signal = np.asarray(signal)
    if not np.issubdtype(signal.dtype, np.floating):
        signal = signal.astype(np.float32)

    try:
        import librosa
    except Exception:
        # Deliberately broad: a librosa that cannot be imported for any
        # reason simply selects the numpy path.
        librosa = None

    if librosa is not None:
        try:
            # librosa may use audioread / av backend
            mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc,
                                        n_fft=n_fft, hop_length=hop_length,
                                        n_mels=n_mels)
            return mfcc.T  # (T, n_mfcc)
        except Exception as exc:
            # Still fall back, but say so: a silent switch of implementation
            # for single utterances would be very hard to notice later.
            import warnings
            warnings.warn(
                f"librosa MFCC failed ({exc!r}); using the numpy fallback",
                RuntimeWarning,
            )

    # pure-numpy fallback
    if len(signal) < n_fft:
        # Guarantee at least one full frame so the result is never empty.
        signal = np.pad(signal, (0, n_fft - len(signal)))
    frames = _framing(signal, n_fft, hop_len=hop_length)
    frames = frames * np.hanning(n_fft)[None, :]

    mag = np.abs(np.fft.rfft(frames, n=n_fft))

    # mel filterbank
    def hz2mel(f):
        return 2595 * np.log10(1 + f / 700)

    def mel2hz(m):
        return 700 * (10 ** (m / 2595) - 1)

    mel_low, mel_high = hz2mel(80), hz2mel(sr / 2)
    mel_pts = np.linspace(mel_low, mel_high, n_mels + 2)
    hz_pts = mel2hz(mel_pts)
    bins = np.floor((n_fft + 1) * hz_pts / sr).astype(int)

    fbank = np.zeros((n_mels, n_fft // 2 + 1))
    for m in range(1, n_mels + 1):
        lo, ctr, hi = bins[m - 1], bins[m], bins[m + 1]
        # Rising and falling edge of the triangle; the epsilon keeps the
        # division defined when neighbouring bins coincide.
        fbank[m - 1, lo:ctr] = (np.arange(lo, ctr) - lo) / (ctr - lo + 1e-8)
        fbank[m - 1, ctr:hi] = (hi - np.arange(ctr, hi)) / (hi - ctr + 1e-8)

    mel_energy = np.dot(mag ** 2, fbank.T)
    log_mel = np.log(np.maximum(mel_energy, 1e-9))  # floor avoids log(0)

    # DCT-II
    n = np.arange(n_mfcc)[:, None]
    k = np.arange(n_mels)[None, :]
    dct = np.cos(np.pi * n * (2 * k + 1) / (2 * n_mels))
    mfcc = np.dot(log_mel, dct.T)
    return mfcc  # (T, n_mfcc)
|
||
|
|
|
||
|
|
|
||
|
|
def mfcc_features(wav_path: str) -> np.ndarray:
    """Load *wav_path* and average its MFCC frames over time → (n_mfcc,)."""
    per_frame = compute_mfcc(_load_audio(wav_path))
    return per_frame.mean(axis=0)
|
||
|
|
|
||
|
|
|
||
|
|
# ── text tokenisation ──────────────────────────────────────────────────────
|
||
|
|
def get_text_features(text: str, tokenizer=None, model=None,
                      max_len: int = 64) -> np.ndarray:
    """Encode one utterance as a fixed-size text feature vector.

    With both ``tokenizer`` and ``model`` given, the text is tokenised
    (truncated / padded to ``max_len``) and the hidden state of the first
    token is returned.  Otherwise each lower-cased, whitespace-separated word
    is mapped to a hashed id in ``[1, 30521]`` and the sequence is
    right-padded with 0 to ``max_len``.

    The fallback uses CRC-32 rather than the built-in ``hash()``, whose
    result for strings changes between interpreter runs unless
    ``PYTHONHASHSEED`` is fixed; ids are therefore reproducible.  Id 0 is
    reserved for padding.

    Args:
        text: Utterance transcript (may be empty).
        tokenizer: Hugging Face style tokenizer, or None.
        model: Matching encoder exposing ``last_hidden_state``, or None.
        max_len: Maximum / padded sequence length.

    Returns:
        1-D float array of the model's hidden size, or an int32 array of
        shape (max_len,) in the fallback case.
    """
    if tokenizer is not None and model is not None:
        import torch
        enc = tokenizer(text, return_tensors="pt", truncation=True,
                        max_length=max_len, padding="max_length")
        with torch.no_grad():
            out = model(**enc)
        return out.last_hidden_state[:, 0, :].squeeze(0).cpu().numpy()

    # simple token-id fallback (stable word hash)
    import zlib
    n_buckets = 30522
    tokens = text.lower().split()[:max_len]
    # "1 +" shifts real words off the padding id.
    ids = [1 + zlib.crc32(t.encode("utf-8")) % (n_buckets - 1) for t in tokens]
    ids += [0] * (max_len - len(ids))
    return np.array(ids, dtype=np.int32)
|
||
|
|
|
||
|
|
|
||
|
|
# ── label parsing ──────────────────────────────────────────────────────────
|
||
|
|
def parse_label_file(label_path: str) -> dict:
    """Map utterance id → lower-cased emotion tag for one label file.

    Only lines that begin with "[" are read.  They are split on tabs: the
    second field is the utterance id and the third the emotion.  A line
    with no third field is recorded as "xxx"; a line with fewer than two
    fields is ignored.
    """
    with open(label_path, encoding="utf-8") as fh:
        rows = [raw.strip().split("\t") for raw in fh if raw.startswith("[")]

    result = {}
    for fields in rows:
        if len(fields) < 2:
            continue
        if len(fields) >= 3:
            emotion = fields[2].strip().lower()
        else:
            emotion = "xxx"
        result[fields[1].strip()] = emotion
    return result
|
||
|
|
|
||
|
|
|
||
|
|
def parse_transcription_file(trans_path: str) -> dict:
    """Map utterance id → transcript text for one transcription file.

    A line is used when, after stripping, it has the form
    ``<id> [<anything>]: <text>``; every other line is ignored.
    """
    line_re = re.compile(r"^(\w+)\s*\[.*?\]\s*:\s*(.+)$")
    with open(trans_path, encoding="utf-8") as fh:
        hits = [line_re.match(raw.strip()) for raw in fh]
    return {hit.group(1): hit.group(2).strip() for hit in hits if hit}
|
||
|
|
|
||
|
|
|
||
|
|
# ── main extraction ────────────────────────────────────────────────────────
|
||
|
|
def extract_iemocap(data_root: str, out_dir: str,
                    use_transformer: bool = False,
                    model_name: str = "roberta-base",
                    val_sessions: list = None,
                    test_sessions: list = None):
    """Extract text / audio features for IEMOCAP and save them per split.

    Walks ``<data_root>/IEMOCAP_full_release/Session1..5``.  For every
    utterance whose emotion is in ``EMOTION_MAP`` and whose wav file exists,
    it computes the mean-MFCC audio vector and the text feature vector and
    assigns the sample to train / val / test by session.  Utterances whose
    extraction raises, or whose audio features contain NaN / inf, are
    reported and skipped so that invalid values never reach the saved arrays.

    Writes ``label_map.json`` and, for each non-empty split,
    ``{split}_text.npy``, ``{split}_audio.npy`` and ``{split}_labels.npy``
    into ``out_dir`` (created if needed).

    Args:
        data_root: Directory that contains ``IEMOCAP_full_release/``.
        out_dir: Output directory.
        use_transformer: Load ``model_name`` via ``transformers`` and use it
            for the text features instead of hashed word ids.
        model_name: Hugging Face model identifier.
        val_sessions: Session names used for validation
            (default ``["Session4"]``).
        test_sessions: Session names used for testing
            (default ``["Session5"]``).  A session listed in both goes to
            the test split.
    """
    data_root = Path(data_root)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if val_sessions is None:
        val_sessions = ["Session4"]
    if test_sessions is None:
        test_sessions = ["Session5"]

    tokenizer, model = None, None
    if use_transformer:
        from transformers import AutoTokenizer, AutoModel
        print(f"Loading {model_name} …")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        model.eval()

    splits = {"train": [], "val": [], "test": []}

    for session in SESSIONS:
        sess_dir = data_root / "IEMOCAP_full_release" / session
        if not sess_dir.exists():
            print(f" [skip] {sess_dir} not found")
            continue

        emo_dir = sess_dir / "dialog" / "EmoEvaluation"
        trans_dir = sess_dir / "dialog" / "transcriptions"
        wav_dir = sess_dir / "sentences" / "wav"

        if session in test_sessions:
            split = "test"
        elif session in val_sessions:
            split = "val"
        else:
            split = "train"

        # One label file per dialog; the transcript file has the same stem.
        for label_file in sorted(emo_dir.glob("*.txt")):
            labels = parse_label_file(str(label_file))
            dialog_id = label_file.stem

            trans_file = trans_dir / (dialog_id + ".txt")
            texts = parse_transcription_file(str(trans_file)) if trans_file.exists() else {}

            for uid, emo in labels.items():
                if emo not in EMOTION_MAP:
                    continue
                label = EMOTION_MAP[emo]
                # A missing transcript yields an empty string, not a skip.
                text = texts.get(uid, "")

                wav_path = wav_dir / dialog_id / (uid + ".wav")
                if not wav_path.exists():
                    continue

                try:
                    audio_feat = mfcc_features(str(wav_path))
                    text_feat = get_text_features(text, tokenizer, model)
                except Exception as e:
                    # Best effort per utterance: report and move on.
                    print(f" [warn] {uid}: {e}")
                    continue

                if not np.all(np.isfinite(audio_feat)):
                    print(f" [warn] {uid}: non-finite audio features, skipped")
                    continue

                splits[split].append((text_feat, audio_feat, label))

        print(f" {session} → {split}: {len(splits[split])} so far")

    # json.dump writes the integer keys as strings ("0", "1", ...).
    label_map = {i: name for i, name in enumerate(LABEL_NAMES)}
    with open(out_dir / "label_map.json", "w", encoding="utf-8") as f:
        json.dump(label_map, f, indent=2)

    for split, items in splits.items():
        if not items:
            print(f" [warn] {split} is empty")
            continue
        text_arr = np.stack([x[0] for x in items])
        audio_arr = np.stack([x[1] for x in items])
        label_arr = np.array([x[2] for x in items], dtype=np.int64)
        np.save(out_dir / f"{split}_text.npy", text_arr)
        np.save(out_dir / f"{split}_audio.npy", audio_arr)
        np.save(out_dir / f"{split}_labels.npy", label_arr)
        print(f" Saved {split}: text {text_arr.shape}, audio {audio_arr.shape}, labels {label_arr.shape}")

    print("Done →", out_dir)
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Command-line entry point: parse the options, resolve the output
    # directory and run the extraction.
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--data_root",
        required=True,
        help="Parent dir containing IEMOCAP_full_release/",
    )
    cli.add_argument("--out_dir", default=None)
    cli.add_argument("--use_transformer", action="store_true")
    cli.add_argument("--model_name", default="roberta-base")
    opts = cli.parse_args()

    # An explicit --out_dir wins; otherwise the location is derived from
    # the ZSY environment variable (default "/root").
    target_dir = opts.out_dir
    if not target_dir:
        base = os.environ.get("ZSY", "/root")
        target_dir = f"{base}/multimodal_affect/data/iemocap"

    extract_iemocap(
        opts.data_root,
        target_dir,
        use_transformer=opts.use_transformer,
        model_name=opts.model_name,
    )
|