""" P0-4: Multimodal noise generation for robustness experiments. Supports three modalities: text, audio, visual. Each modality has configurable noise types and intensity levels. Usage: python generate_noise.py --config configs/noise_configs.yaml \ --data_dir $ZSY/multimodal_affect/data/iemocap \ --out_dir $ZSY/multimodal_affect/data/iemocap_noisy Config schema → see configs/noise_configs.yaml """ import os import json import argparse import yaml import numpy as np from pathlib import Path from typing import Dict, Any, Optional RNG = np.random.default_rng(42) # ═══════════════════════════════════════════════════════ # TEXT NOISE # ═══════════════════════════════════════════════════════ def _word_drop(ids: np.ndarray, drop_rate: float) -> np.ndarray: """Randomly zero-out token ids (simulates word deletion).""" mask = RNG.random(ids.shape) < drop_rate return np.where(mask, 0, ids) def _word_swap(ids: np.ndarray, swap_rate: float) -> np.ndarray: """Randomly shuffle adjacent tokens.""" ids = ids.copy() n = len(ids) for i in range(n - 1): if RNG.random() < swap_rate: ids[i], ids[i + 1] = ids[i + 1], ids[i] return ids def _random_replace(ids: np.ndarray, replace_rate: float, vocab_size: int = 30522) -> np.ndarray: """Replace tokens with random vocab ids.""" ids = ids.copy() mask = RNG.random(ids.shape) < replace_rate rand_ids = RNG.integers(1, vocab_size, size=ids.shape) return np.where(mask & (ids != 0), rand_ids, ids) def add_text_noise(features: np.ndarray, cfg: Dict) -> np.ndarray: """Apply text noise to an array of token-id features (N, seq_len).""" noise_type = cfg.get("type", "word_drop") intensity = float(cfg.get("intensity", 0.1)) if noise_type == "word_drop": return np.stack([_word_drop(row, intensity) for row in features]) if noise_type == "word_swap": return np.stack([_word_swap(row, intensity) for row in features]) if noise_type == "random_replace": return np.stack([_random_replace(row, intensity) for row in features]) if noise_type == "gaussian": # for embedding features (N, dim) not token ids noise = RNG.standard_normal(features.shape).astype(np.float32) return features + intensity * noise raise ValueError(f"Unknown text noise type: {noise_type}") # ═══════════════════════════════════════════════════════ # AUDIO NOISE # ═══════════════════════════════════════════════════════ def add_audio_noise(features: np.ndarray, cfg: Dict) -> np.ndarray: """Apply noise to audio feature matrix (N, n_mfcc).""" noise_type = cfg.get("type", "gaussian") intensity = float(cfg.get("intensity", 0.05)) if noise_type == "gaussian": noise = RNG.standard_normal(features.shape).astype(np.float32) return features + intensity * noise * features.std(axis=0, keepdims=True) if noise_type == "masking": # mask entire feature dimensions (simulates missing mic) features = features.copy() n_mask = max(1, int(features.shape[1] * intensity)) dims = RNG.choice(features.shape[1], n_mask, replace=False) features[:, dims] = 0.0 return features if noise_type == "time_mask": # mask random samples (simulates packet loss for temporal features) features = features.copy() n_mask = max(1, int(features.shape[0] * intensity)) rows = RNG.choice(features.shape[0], n_mask, replace=False) features[rows, :] = 0.0 return features if noise_type == "scale": # random amplitude scaling scale = 1.0 + intensity * (RNG.random(features.shape[0]) - 0.5) * 2 return features * scale[:, None] raise ValueError(f"Unknown audio noise type: {noise_type}") # ═══════════════════════════════════════════════════════ # VISUAL NOISE (operates on feature vectors, not pixels) # ═══════════════════════════════════════════════════════ def add_visual_noise(features: np.ndarray, cfg: Dict) -> np.ndarray: """Apply noise to visual feature matrix (N, feat_dim).""" noise_type = cfg.get("type", "gaussian") intensity = float(cfg.get("intensity", 0.1)) if noise_type == "gaussian": noise = RNG.standard_normal(features.shape).astype(np.float32) return features + intensity * noise if noise_type == "dropout": mask = (RNG.random(features.shape) > intensity).astype(np.float32) return features * mask if noise_type == "occlusion": # zero out a contiguous block of feature dims features = features.copy() start = RNG.integers(0, max(1, features.shape[1] - 1)) length = max(1, int(features.shape[1] * intensity)) features[:, start:start + length] = 0.0 return features if noise_type == "missing_modality": # simulate completely missing video frames features = features.copy() n_missing = max(1, int(len(features) * intensity)) idx = RNG.choice(len(features), n_missing, replace=False) features[idx, :] = 0.0 return features raise ValueError(f"Unknown visual noise type: {noise_type}") # ═══════════════════════════════════════════════════════ # COMBINED MULTIMODAL NOISE # ═══════════════════════════════════════════════════════ MODALITY_SPECS = [ ("text", ("text",), add_text_noise), ("audio", ("audio",), add_audio_noise), # Dataset files use *_vision.npy. Older configs used "visual", so keep it # as an input alias but always write the canonical "vision" filename. ("vision", ("vision", "visual"), add_visual_noise), ] def _get_modality_cfg(noise_cfg: Dict, aliases: tuple) -> Dict: for name in aliases: if name in noise_cfg: return noise_cfg[name] return noise_cfg.get("default", {}) def apply_noise_config(data_dir: Path, out_dir: Path, noise_cfg: Dict, splits: list = None): """Apply noise config to all splits and modalities found in data_dir.""" if splits is None: splits = ["train", "val", "test"] out_dir.mkdir(parents=True, exist_ok=True) for split in splits: for modality, aliases, fn in MODALITY_SPECS: src = data_dir / f"{split}_{modality}.npy" if not src.exists(): continue features = np.load(str(src)) mod_cfg = _get_modality_cfg(noise_cfg, aliases) if mod_cfg: noisy = fn(features.astype(np.float32), mod_cfg) else: noisy = features.astype(np.float32).copy() dst = out_dir / f"{split}_{modality}.npy" np.save(str(dst), noisy) print(f" {split}/{modality}: {features.shape} → {dst.name}") # copy labels unchanged label_src = data_dir / f"{split}_labels.npy" if label_src.exists(): import shutil shutil.copy2(str(label_src), str(out_dir / f"{split}_labels.npy")) # copy metadata for meta_file in ["label_map.json", "meta.json"]: src = data_dir / meta_file if src.exists(): import shutil shutil.copy2(str(src), str(out_dir / meta_file)) def generate_noise_variants(data_dir: str, out_base: str, config: Dict): """Generate multiple noise variants as defined in config.""" data_dir = Path(data_dir) out_base = Path(out_base) variants = config.get("variants", []) if not variants: # single-variant mode: apply config directly apply_noise_config(data_dir, out_base, config.get("noise", {})) return for variant in variants: name = variant["name"] noise_cfg = variant["noise"] out_dir = out_base / name print(f"\n[Variant: {name}]") apply_noise_config(data_dir, out_dir, noise_cfg) with open(out_dir / "noise_config.json", "w") as f: json.dump(variant, f, indent=2) print(f"\nAll variants saved under {out_base}") if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--config", required=True, help="Path to noise_configs.yaml") parser.add_argument("--data_dir", required=True, help="Dir with {split}_{modality}.npy files") parser.add_argument("--out_dir", default=None, help="Output base dir (default: data_dir + '_noisy')") args = parser.parse_args() with open(args.config, encoding="utf-8") as f: config = yaml.safe_load(f) zsy = os.environ.get("ZSY", "/root") out_dir = args.out_dir or args.data_dir.rstrip("/") + "_noisy" generate_noise_variants(args.data_dir, out_dir, config)