"""IEMOCAP feature extraction script.

Expected dataset structure (as read by ``extract_iemocap``):

    $DATA_ROOT/IEMOCAP_full_release/
        Session1/ ... Session5/
            dialog/
                EmoEvaluation/     -> label files (.txt), one per dialog
                transcriptions/    -> transcript files (.txt), one per dialog
            sentences/
                wav/<dialog_id>/   -> utterance wav files (<utterance_id>.wav)

Output: $ZSY/multimodal_affect/data/iemocap/
    {train,val,test}_text.npy    shape: (N, 64) hashed token ids
                                 (or (N, hidden) [CLS] embeddings when a
                                 transformer model is used)
    {train,val,test}_audio.npy   shape: (N, 40) MFCC means
    {train,val,test}_labels.npy  shape: (N,) int labels
    label_map.json
"""

import os
import re
import json
import wave
import zlib
import struct
import argparse
from pathlib import Path
from typing import Optional

import numpy as np

# ── constants ──────────────────────────────────────────────────────────────
EMOTION_MAP = {"ang": 0, "hap": 1, "exc": 1, "sad": 2, "neu": 3}  # exc merged into hap
SESSIONS = ["Session1", "Session2", "Session3", "Session4", "Session5"]
LABEL_NAMES = ["angry", "happy", "sad", "neutral"]
SAMPLE_RATE = 16000
N_MFCC = 40
SEED = 42
# Size of the id space used by the hashed-token fallback; id 0 is padding.
HASH_VOCAB_SIZE = 30522

# "<utterance_id> [<start>-<end>]: <text>" lines in transcription files.
_TRANSCRIPT_RE = re.compile(r"^(\w+)\s*\[.*?\]\s*:\s*(.+)$")


# ── audio utilities (no libsndfile needed) ─────────────────────────────────
def _pcm_to_float(arr: np.ndarray) -> np.ndarray:
    """Convert a sample array to float32, scaling integer PCM to [-1, 1).

    Floating-point input is only cast to float32 (no rescaling).
    """
    if np.issubdtype(arr.dtype, np.unsignedinteger):
        # Unsigned PCM is offset-binary: the midpoint represents silence.
        half = (float(np.iinfo(arr.dtype).max) + 1.0) / 2.0
        return (arr.astype(np.float32) - np.float32(half)) / np.float32(half)
    if np.issubdtype(arr.dtype, np.signedinteger):
        full_scale = float(np.iinfo(arr.dtype).max) + 1.0
        return arr.astype(np.float32) / np.float32(full_scale)
    return arr.astype(np.float32)


def _load_wav_stdlib(path: str) -> np.ndarray:
    """Load WAV with stdlib wave module → float32 mono array.

    Only 16-bit and 32-bit integer PCM are supported.

    Raises:
        ValueError: if the file uses any other sample width.
    """
    with wave.open(path, "rb") as f:
        n_channels = f.getnchannels()
        sampwidth = f.getsampwidth()
        raw = f.readframes(f.getnframes())

    # WAV PCM is little-endian; be explicit so big-endian hosts decode it too.
    if sampwidth == 2:
        samples = _pcm_to_float(np.frombuffer(raw, dtype="<i2"))
    elif sampwidth == 4:
        samples = _pcm_to_float(np.frombuffer(raw, dtype="<i4"))
    else:
        raise ValueError(f"Unsupported sample width: {sampwidth}")

    if n_channels > 1:
        # Samples are interleaved per frame; average the channels to mono.
        samples = samples.reshape(-1, n_channels).mean(axis=1)
    return samples


def _load_audio(path: str) -> np.ndarray:
    """Try av → stdlib wave, return float32 mono array.

    PyAV is optional. Any failure while importing it or decoding with it is
    deliberately swallowed and the stdlib WAV loader is used instead, so
    errors raised from this function come from ``_load_wav_stdlib``.
    """
    try:
        import av
    except Exception:
        av = None

    if av is not None:
        try:
            container = av.open(path)
            try:
                stream = next(s for s in container.streams if s.type == "audio")
                chunks = []
                for packet in container.demux(stream):
                    for frame in packet.decode():
                        # Normalise integer PCM so both loaders return the
                        # same amplitude scale.
                        arr = _pcm_to_float(frame.to_ndarray())
                        if arr.ndim == 2:
                            arr = arr.mean(axis=0)
                        chunks.append(arr.astype(np.float32))
            finally:
                # Close the container even if decoding failed part-way.
                container.close()
            if chunks:
                return np.concatenate(chunks)
        except Exception:
            # Best-effort backend: fall through to the stdlib loader.
            pass

    return _load_wav_stdlib(path)


# ── MFCC via DCT (no librosa fallback if soundfile missing) ───────────────
def _framing(signal, frame_len, hop_len):
    """Slice a 1-D signal into overlapping frames → (n_frames, frame_len).

    A signal shorter than one frame is zero-padded to ``frame_len`` so that
    at least one frame is always returned.
    """
    signal = np.asarray(signal)
    if len(signal) < frame_len:
        signal = np.pad(signal, (0, frame_len - len(signal)))
    n_frames = 1 + (len(signal) - frame_len) // hop_len
    idx = np.arange(frame_len)[None, :] + hop_len * np.arange(n_frames)[:, None]
    return signal[idx]


def compute_mfcc(signal: np.ndarray, sr: int = SAMPLE_RATE, n_mfcc: int = N_MFCC,
                 n_fft: int = 512, hop_length: int = 160,
                 n_mels: int = 40) -> np.ndarray:
    """Compute MFCCs, preferring librosa and falling back to pure numpy.

    Args:
        signal: 1-D audio samples.
        sr: sample rate in Hz.
        n_mfcc: number of cepstral coefficients to return.
        n_fft: FFT size; also the frame length of the numpy fallback.
        hop_length: hop between successive frames, in samples.
        n_mels: number of mel filterbank channels.

    Returns:
        Array of shape (T, n_mfcc).
    """
    # NOTE: librosa is optional; any failure (missing package or runtime
    # error) deliberately drops through to the numpy implementation below.
    try:
        import librosa  # librosa may use audioread / av backend
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc,
                                    n_fft=n_fft, hop_length=hop_length,
                                    n_mels=n_mels)
        return mfcc.T  # (T, n_mfcc)
    except Exception:
        pass

    # pure-numpy fallback
    frame_len = n_fft
    frames = _framing(signal, frame_len, hop_len=hop_length)
    window = np.hanning(frame_len)
    frames = frames * window[None, :]
    mag = np.abs(np.fft.rfft(frames, n=n_fft))

    # mel filterbank
    def hz2mel(f):
        return 2595 * np.log10(1 + f / 700)

    def mel2hz(m):
        return 700 * (10 ** (m / 2595) - 1)

    mel_low, mel_high = hz2mel(80), hz2mel(sr / 2)
    mel_pts = np.linspace(mel_low, mel_high, n_mels + 2)
    hz_pts = mel2hz(mel_pts)
    bins = np.floor((n_fft + 1) * hz_pts / sr).astype(int)
    fbank = np.zeros((n_mels, n_fft // 2 + 1))
    for m in range(1, n_mels + 1):
        lo, ctr, hi = bins[m - 1], bins[m], bins[m + 1]
        # Triangular filter; the epsilon guards against coinciding bin edges.
        fbank[m - 1, lo:ctr] = (np.arange(lo, ctr) - lo) / (ctr - lo + 1e-8)
        fbank[m - 1, ctr:hi] = (hi - np.arange(ctr, hi)) / (hi - ctr + 1e-8)

    mel_energy = np.dot(mag ** 2, fbank.T)
    log_mel = np.log(np.maximum(mel_energy, 1e-9))  # floor avoids log(0)

    # DCT-II (no normalisation factor applied)
    n = np.arange(n_mfcc)[:, None]
    k = np.arange(n_mels)[None, :]
    dct = np.cos(np.pi * n * (2 * k + 1) / (2 * n_mels))
    mfcc = np.dot(log_mel, dct.T)
    return mfcc  # (T, n_mfcc)


def mfcc_features(wav_path: str) -> np.ndarray:
    """Return mean MFCC over time → shape (n_mfcc,).

    Raises:
        ValueError: if the file decodes to zero samples.
    """
    sig = _load_audio(wav_path)
    if sig.size == 0:
        raise ValueError(f"Empty audio: {wav_path}")
    mfcc = compute_mfcc(sig)
    return mfcc.mean(axis=0)


# ── text tokenisation ──────────────────────────────────────────────────────
def get_text_features(text: str, tokenizer=None, model=None,
                      max_len: int = 64) -> np.ndarray:
    """Return a [CLS] embedding or a hashed token-id vector of length max_len.

    With both ``tokenizer`` and ``model`` given, the text is encoded and the
    first-position hidden state of the model output is returned. Otherwise
    the text is lower-cased, split on whitespace, truncated to ``max_len``
    words, and each word is mapped to a stable id in
    ``[1, HASH_VOCAB_SIZE - 1]``; the vector is right-padded with 0.
    """
    if tokenizer is not None and model is not None:
        import torch
        enc = tokenizer(text, return_tensors="pt", truncation=True,
                        max_length=max_len, padding="max_length")
        with torch.no_grad():
            out = model(**enc)
        return out.last_hidden_state[:, 0, :].squeeze(0).cpu().numpy()

    # simple token-id fallback (word hash)
    # crc32 is used instead of the built-in hash(): str hashes are salted per
    # process, which would make the saved features differ between runs.
    tokens = text.lower().split()[:max_len]
    ids = [1 + zlib.crc32(t.encode("utf-8")) % (HASH_VOCAB_SIZE - 1)
           for t in tokens]
    ids += [0] * (max_len - len(ids))
    return np.array(ids, dtype=np.int32)


# ── label parsing ──────────────────────────────────────────────────────────
def parse_label_file(label_path: str) -> dict:
    """Return dict: utterance_id → emotion string.

    Only lines starting with "[" are read. They are split on tabs: the
    second field is the utterance id and the third the emotion code
    (lower-cased; "xxx" when the field is absent).
    """
    labels = {}
    with open(label_path, encoding="utf-8") as f:
        for line in f:
            if not line.startswith("["):
                continue
            parts = line.strip().split("\t")
            if len(parts) < 2:
                continue
            uid = parts[1].strip()
            emo = parts[2].strip().lower() if len(parts) > 2 else "xxx"
            labels[uid] = emo
    return labels


def parse_transcription_file(trans_path: str) -> dict:
    """Return dict: utterance_id → text.

    Lines that do not match "<id> [<anything>]: <text>" are ignored.
    """
    texts = {}
    with open(trans_path, encoding="utf-8") as f:
        for line in f:
            m = _TRANSCRIPT_RE.match(line.strip())
            if m:
                texts[m.group(1)] = m.group(2).strip()
    return texts


# ── main extraction ────────────────────────────────────────────────────────
def _split_for_session(session: str, val_sessions, test_sessions) -> str:
    """Return the split name for a session; test takes precedence over val."""
    if session in test_sessions:
        return "test"
    if session in val_sessions:
        return "val"
    return "train"


def extract_iemocap(data_root: str, out_dir: str, use_transformer: bool = False,
                    model_name: str = "roberta-base",
                    val_sessions: Optional[list] = None,
                    test_sessions: Optional[list] = None):
    """Extract text/audio features and labels for every IEMOCAP session.

    Args:
        data_root: parent directory containing ``IEMOCAP_full_release/``.
        out_dir: directory for the ``.npy`` files and ``label_map.json``
            (created if missing).
        use_transformer: if True, load ``model_name`` via ``transformers``
            and use [CLS] embeddings as text features; otherwise use hashed
            token ids.
        model_name: model identifier passed to ``from_pretrained``.
        val_sessions: sessions assigned to the val split
            (default ``["Session4"]``).
        test_sessions: sessions assigned to the test split
            (default ``["Session5"]``).

    Utterances whose emotion is not in ``EMOTION_MAP`` or whose wav file is
    missing are skipped; a per-utterance feature failure is reported and
    skipped. Splits that end up empty are not written.
    """
    data_root = Path(data_root)
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    if val_sessions is None:
        val_sessions = ["Session4"]
    if test_sessions is None:
        test_sessions = ["Session5"]

    tokenizer, model = None, None
    if use_transformer:
        from transformers import AutoTokenizer, AutoModel
        print(f"Loading {model_name} …")
        tokenizer = AutoTokenizer.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        model.eval()

    splits = {"train": [], "val": [], "test": []}

    for session in SESSIONS:
        sess_dir = data_root / "IEMOCAP_full_release" / session
        if not sess_dir.exists():
            print(f" [skip] {sess_dir} not found")
            continue

        emo_dir = sess_dir / "dialog" / "EmoEvaluation"
        trans_dir = sess_dir / "dialog" / "transcriptions"
        wav_dir = sess_dir / "sentences" / "wav"
        split = _split_for_session(session, val_sessions, test_sessions)

        # Sorted so the row order of the saved arrays is deterministic.
        for label_file in sorted(emo_dir.glob("*.txt")):
            labels = parse_label_file(str(label_file))
            dialog_id = label_file.stem
            trans_file = trans_dir / (dialog_id + ".txt")
            texts = (parse_transcription_file(str(trans_file))
                     if trans_file.exists() else {})

            for uid, emo in labels.items():
                if emo not in EMOTION_MAP:
                    continue
                label = EMOTION_MAP[emo]
                text = texts.get(uid, "")
                wav_path = wav_dir / dialog_id / (uid + ".wav")
                if not wav_path.exists():
                    continue
                # One bad utterance must not abort the whole extraction.
                try:
                    audio_feat = mfcc_features(str(wav_path))
                    text_feat = get_text_features(text, tokenizer, model)
                    splits[split].append((text_feat, audio_feat, label))
                except Exception as e:
                    print(f" [warn] {uid}: {e}")

        print(f" {session} → {split}: {len(splits[split])} so far")

    label_map = {i: name for i, name in enumerate(LABEL_NAMES)}
    with open(out_dir / "label_map.json", "w", encoding="utf-8") as f:
        json.dump(label_map, f, indent=2)

    for split, items in splits.items():
        if not items:
            print(f" [warn] {split} is empty")
            continue
        text_arr = np.stack([x[0] for x in items])
        audio_arr = np.stack([x[1] for x in items])
        label_arr = np.array([x[2] for x in items], dtype=np.int64)
        np.save(out_dir / f"{split}_text.npy", text_arr)
        np.save(out_dir / f"{split}_audio.npy", audio_arr)
        np.save(out_dir / f"{split}_labels.npy", label_arr)
        print(f" Saved {split}: text {text_arr.shape}, audio {audio_arr.shape}, labels {label_arr.shape}")

    print("Done →", out_dir)


def main() -> None:
    """Parse command-line arguments and run the extraction."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_root", required=True,
                        help="Parent dir containing IEMOCAP_full_release/")
    parser.add_argument("--out_dir", default=None)
    parser.add_argument("--use_transformer", action="store_true")
    parser.add_argument("--model_name", default="roberta-base")
    args = parser.parse_args()

    # Default output location is rooted at $ZSY (falls back to /root).
    zsy = os.environ.get("ZSY", "/root")
    out_dir = args.out_dir or f"{zsy}/multimodal_affect/data/iemocap"
    extract_iemocap(args.data_root, out_dir,
                    use_transformer=args.use_transformer,
                    model_name=args.model_name)


if __name__ == "__main__":
    main()