Files
CompanionGuard-RL/旧方向信息/scripts/run_eval_ablation.py

297 lines
13 KiB
Python
Raw Normal View History

"""
Upload and launch test evaluation + D1-4 ablation experiments on server.
Uses Stage B v1 checkpoint (best val WF1=0.7291).
"""
import io
import os
import shlex
import warnings

import paramiko
# Silence every Python warning for the rest of this script.
warnings.filterwarnings('ignore')
# Workspace root on the remote server; the paths below are built from it.
ZSY = '/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy'
# Remote project directory; the eval script is uploaded beneath it.
PROJ = ZSY + '/multimodal_affect'
# Presumably the remote environment's Python interpreter; it is not referenced
# in the visible part of this script.
ENV = ZSY + '/envs/multimodal_affect/bin/python'
# ── eval_d1.py ────────────────────────────────────────────────────────────
EVAL_SCRIPT = r'''#!/usr/bin/env python3
"""
Evaluate Direction-1 checkpoint on test set.
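Also evaluates the Fixed-Equal ablation, the Stage-A-only ablation (or, without
--stage_a_ckpt, the same encoder/classifier with the RL agent removed), and the
noise-robustness comparison on the test set. The rl-nonoise, rl-noc (beta=0) and
rl-nostab (gamma=0) variants are not run by this script.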
Usage:
python scripts/eval/eval_d1.py \
--checkpoint outputs/checkpoints/d1_stageB/best_v1.ckpt \
--dataset IEMOCAP \
--gpu 0
"""
import os, sys, argparse, json, csv, logging
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import f1_score, accuracy_score, classification_report
# Workspace root; can be overridden through the ZSY environment variable.
ZSY = os.environ.get("ZSY", "/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy")
# Project root: data/ and outputs/ paths used in main() are resolved against it.
PROJ = os.path.join(ZSY, "multimodal_affect")
# Put the project root first on sys.path so the `src.*` imports below resolve.
sys.path.insert(0, PROJ)
from src.data.dataset import MultimodalDataset, get_dataloader
from src.models.encoders import MultimodalEncoder
from src.models.classifier import EmotionClassifier
from src.rl.fusion_agent import ModalFusionAgent
from src.rl.reward import compute_reward
# Timestamped INFO-level logging on the root logger for the whole script.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s")
@torch.no_grad()
def predict(encoder, classifier, loader, device, agent=None, fixed_weights=None):
    """Classify every batch of ``loader`` and return predictions with their labels.

    The three modality features produced by ``encoder`` are fused in one of
    three ways, checked in this order:

    * ``agent`` given: the weights come from ``agent.get_action_and_value``,
      fed with the encoder confidences concatenated with a sigmoid of the
      audio input's standard deviation over its last dimension.
    * ``fixed_weights`` given: the same three weights are used for every sample.
    * neither: plain average of the three features.

    Args:
        encoder: callable returning ``(text_feat, audio_feat, vision_feat, confs)``.
        classifier: callable mapping the fused feature to class logits.
        loader: iterable of dicts with "text", "audio", "vision", "labels" tensors.
        device: device every batch tensor is moved to.
        agent: optional fusion policy.
        fixed_weights: optional sequence of three fusion weights.

    Returns:
        ``(preds, labels)`` as numpy arrays, concatenated over all batches.
    """
    encoder.eval()
    classifier.eval()
    if agent:
        agent.eval()

    pred_chunks, label_chunks = [], []
    for sample in loader:
        txt = sample["text"].to(device)
        aud = sample["audio"].to(device)
        vis = sample["vision"].to(device)
        gold = sample["labels"].to(device)

        text_feat, audio_feat, vision_feat, confidences = encoder(txt, aud, vis)

        if agent is None and fixed_weights is None:
            blended = (text_feat + audio_feat + vision_feat) / 3.0
        else:
            if agent is not None:
                audio_spread = aud.std(dim=-1, keepdim=True).sigmoid()
                policy_state = torch.cat([confidences, audio_spread], dim=-1)
                mix, *_ = agent.get_action_and_value(policy_state)
            else:
                mix = torch.tensor(fixed_weights, device=device).view(1, 3)
            # One column of weights per modality, broadcast over the feature dim.
            blended = (mix[:, 0:1] * text_feat
                       + mix[:, 1:2] * audio_feat
                       + mix[:, 2:3] * vision_feat)

        scores = classifier(blended)
        pred_chunks.append(scores.argmax(-1).cpu())
        label_chunks.append(gold.cpu())

    return torch.cat(pred_chunks).numpy(), torch.cat(label_chunks).numpy()
@torch.no_grad()
def predict_noisy(encoder, classifier, loader, device, variant_data, agent=None, fixed_weights=None):
    """Run inference with a noisy variant, replacing any modalities it provides.

    Rows of each array in ``variant_data`` are consumed in loader order, so the
    loader must iterate the split unshuffled and without dropping samples.
    Modalities absent from ``variant_data`` are taken from the clean batch.

    Fusion follows the same precedence as ``predict``: ``agent`` if given,
    else ``fixed_weights`` if given, else a plain average of the three features.

    Args:
        encoder: callable returning ``(text_feat, audio_feat, vision_feat, confs)``.
        classifier: callable mapping the fused feature to class logits.
        loader: iterable of dicts with "text", "audio", "vision", "labels" tensors.
        device: device every input tensor is moved to.
        variant_data: mapping from modality name to a numpy array of noisy inputs.
        agent: optional fusion policy exposing ``get_action_and_value``.
        fixed_weights: optional sequence of three fusion weights.

    Returns:
        ``(preds, labels)`` as numpy arrays, concatenated over all batches.

    Raises:
        ValueError: if a noisy array runs out of rows before the loader does.
            Without this check the short slice can broadcast against the clean
            modalities and silently yield fewer predictions than labels.
    """
    encoder.eval()
    classifier.eval()
    if agent is not None:
        agent.eval()

    noisy = {name: torch.from_numpy(arr).float() for name, arr in variant_data.items()}

    # The fixed weights never change between batches, so build the tensor once.
    static_mix = None
    if agent is None and fixed_weights is not None:
        static_mix = torch.tensor(fixed_weights, device=device).view(1, 3)

    pred_chunks, label_chunks = [], []
    cursor = 0
    for batch in loader:
        bsz = batch["text"].size(0)
        inputs = {}
        for name in ("text", "audio", "vision"):
            if name in noisy:
                rows = noisy[name][cursor:cursor + bsz]
                if rows.size(0) != bsz:
                    raise ValueError(
                        f"noisy '{name}' array has {noisy[name].size(0)} rows but the "
                        f"loader needs rows {cursor}..{cursor + bsz - 1}"
                    )
            else:
                rows = batch[name]
            inputs[name] = rows.to(device)
        cursor += bsz
        labels = batch["labels"].to(device)

        tf, af, vf, confs = encoder(inputs["text"], inputs["audio"], inputs["vision"])

        if agent is not None:
            # Policy state: encoder confidences plus a squashed audio-std term.
            noise_est = inputs["audio"].std(dim=-1, keepdim=True).sigmoid()
            state = torch.cat([confs, noise_est], dim=-1)
            mix, *_ = agent.get_action_and_value(state)
        else:
            mix = static_mix

        if mix is None:
            fused = (tf + af + vf) / 3.0
        else:
            fused = mix[:, 0:1] * tf + mix[:, 1:2] * af + mix[:, 2:3] * vf

        logits = classifier(fused)
        pred_chunks.append(logits.argmax(-1).cpu())
        label_chunks.append(labels.cpu())

    # Extra trailing rows were silently ignored before; keep returning results
    # but make the likely misalignment visible.
    for name in ("text", "audio", "vision"):
        if name in noisy and noisy[name].size(0) != cursor:
            logging.warning(
                f"noisy '{name}' array has {noisy[name].size(0)} rows but the loader "
                f"yielded {cursor} samples; trailing rows were ignored"
            )

    return torch.cat(pred_chunks).numpy(), torch.cat(label_chunks).numpy()
def metrics(preds, labels, split="test"):
    """Summarise predictions as weighted F1 and accuracy, rounded to 4 decimals.

    Args:
        preds: predicted class ids.
        labels: ground-truth class ids.
        split: name stored under the "split" key of the result.

    Returns:
        Dict with keys "split", "wf1" and "acc".
    """
    weighted_f1 = f1_score(labels, preds, average="weighted", zero_division=0)
    accuracy = accuracy_score(labels, preds)

    summary = {"split": split}
    summary["wf1"] = round(float(weighted_f1), 4)
    summary["acc"] = round(float(accuracy), 4)
    return summary
def load_model(ckpt_path, device):
    """Rebuild the encoder, classifier and (if stored) fusion agent from a checkpoint.

    The checkpoint must provide "text_dim", "audio_dim", "vision_dim",
    "num_classes", "encoder" and "classifier"; "proj_dim" falls back to 1024
    and "agent" is optional.

    Args:
        ckpt_path: path handed to ``torch.load``.
        device: device the checkpoint is mapped to and the modules are moved to.

    Returns:
        ``(encoder, classifier, agent, checkpoint)`` with every module in eval
        mode; ``agent`` is None when the checkpoint has no "agent" entry.
    """
    checkpoint = torch.load(ckpt_path, map_location=device)

    text_dim = checkpoint["text_dim"]
    audio_dim = checkpoint["audio_dim"]
    vision_dim = checkpoint["vision_dim"]
    num_classes = checkpoint["num_classes"]
    proj_dim = checkpoint.get("proj_dim", 1024)

    encoder = MultimodalEncoder(text_dim, audio_dim, vision_dim, proj_dim)
    classifier = EmotionClassifier(proj_dim, num_classes)

    restored = ((encoder, "encoder"), (classifier, "classifier"))
    for module, key in restored:
        module.load_state_dict(checkpoint[key])
    for module, _ in restored:
        module.to(device).eval()

    if "agent" not in checkpoint:
        return encoder, classifier, None, checkpoint

    # The agent's architecture is fixed here, not read from the checkpoint.
    fusion_agent = ModalFusionAgent(state_dim=4, hidden=128)
    fusion_agent.load_state_dict(checkpoint["agent"])
    fusion_agent.to(device).eval()
    return encoder, classifier, fusion_agent, checkpoint
def main():
    """Evaluate a Direction-1 checkpoint and write ablation / noise-robustness results.

    Steps, in order:
      1. RL-Full on val and test, plus a per-class report on test.
      2. Fixed-Equal ablation (uniform 1/3 weights).
      3. Stage-A-only ablation when --stage_a_ckpt is given; otherwise the same
         encoder/classifier evaluated with the agent removed.
      4. RL-Full vs Fixed-Equal on every noisy test variant found on disk.
      5. Write all results to JSON, the clean ablations to CSV, and log a
         summary of the drop relative to the clean test score.

    Outputs default to ``<PROJ>/outputs/results``; --out_json / --out_csv may
    point elsewhere and their parent directories are created on demand.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--checkpoint", required=True)
    parser.add_argument("--stage_a_ckpt", default=None,
                        help="Stage A ckpt for ablations that need encoder+classifier only")
    parser.add_argument("--dataset", default="IEMOCAP")
    parser.add_argument("--gpu", default="0")
    parser.add_argument("--out_json", default=None)
    parser.add_argument("--out_csv", default=None)
    args = parser.parse_args()

    # Fall back to CPU so the evaluation still runs on a machine without CUDA.
    device = torch.device(f"cuda:{args.gpu}" if torch.cuda.is_available() else "cpu")
    data_dir = os.path.join(PROJ, "data", args.dataset.lower())
    noise_root = os.path.join(PROJ, "data", f"{args.dataset.lower()}_noisy")
    NOISE_VARIANTS = [
        "gaussian_light", "gaussian_heavy", "missing_audio",
        "missing_visual", "text_word_drop_30", "audio_masking_50",
        "realistic_mixed", "audio_time_mask",
    ]

    # Datasets. Loaders are unshuffled and keep every sample so the noisy
    # arrays can be matched to samples by position.
    val_ds = MultimodalDataset(data_dir, "val")
    test_ds = MultimodalDataset(data_dir, "test")
    val_loader = get_dataloader(val_ds, 128, shuffle=False, drop_last=False)
    test_loader = get_dataloader(test_ds, 128, shuffle=False, drop_last=False)
    split_loaders = [("val", val_loader), ("test", test_loader)]

    # Load Stage B v1 checkpoint (encoder + classifier + agent)
    enc, cls, agent, ckpt = load_model(args.checkpoint, device)
    logging.info(f"Loaded: {args.checkpoint} val_wf1={ckpt.get('val_wf1',0):.4f}")

    results = {}

    def evaluate_variant(name, encoder, classifier, **fusion):
        """Score one variant on val and test; return {split: (preds, labels)}."""
        outputs = {}
        for split, loader in split_loaders:
            preds, labels = predict(encoder, classifier, loader, device, **fusion)
            m = metrics(preds, labels, split)
            results[f"{name}_{split}"] = m
            logging.info(f" [{split}] WF1={m['wf1']:.4f} Acc={m['acc']:.4f}")
            outputs[split] = (preds, labels)
        return outputs

    # ── 1. Main evaluation: val + test ────────────────────────────────────
    logging.info("=== Main Evaluation (Stage B RL-Full) ===")
    full_outputs = evaluate_variant("RL-Full", enc, cls, agent=agent)
    test_preds, test_labels = full_outputs["test"]
    rpt = classification_report(test_labels, test_preds,
                                target_names=[str(i) for i in range(ckpt["num_classes"])],
                                zero_division=0)
    logging.info(f"\n{rpt}")

    # ── 2. Ablation A: Fixed-Equal (uniform weights, Stage B classifier) ──
    logging.info("=== Ablation: Fixed-Equal ===")
    evaluate_variant("Fixed-Equal", enc, cls, fixed_weights=[1/3, 1/3, 1/3])

    # ── 3. Ablation B: Stage A only (no RL, trained classifier w/ uniform fusion) ─
    if args.stage_a_ckpt:
        logging.info("=== Ablation: Stage-A-Only ===")
        enc_a, cls_a, _, _ = load_model(args.stage_a_ckpt, device)
        evaluate_variant("StageA-Only", enc_a, cls_a)
    else:
        # No Stage A checkpoint: reuse the Stage B encoder+classifier with the
        # agent removed, which makes predict() average the three features.
        # NOTE(review): that is the same fusion as Fixed-Equal above up to
        # float rounding, so this row is expected to match it.
        logging.info("=== Ablation: RL-Agent-Removed (Stage B enc+cls, uniform fusion) ===")
        evaluate_variant("NoRL-UniformFusion", enc, cls, agent=None)

    # ── 4. Noise robustness evaluation ────────────────────────────────────
    logging.info("=== Noise Robustness (test set) ===")
    for vname in NOISE_VARIANTS:
        vdir = os.path.join(noise_root, vname)
        paths = {
            "text": os.path.join(vdir, "test_text.npy"),
            "audio": os.path.join(vdir, "test_audio.npy"),
            "vision": os.path.join(vdir, "test_vision.npy"),
        }
        available = {mod: path for mod, path in paths.items() if os.path.exists(path)}
        if not available:
            logging.info(f" [{vname}] SKIP (no noisy modality files)")
            continue
        missing = sorted(set(paths) - set(available))
        if missing:
            logging.warning(f" [{vname}] missing noisy files for {missing}; clean same-index modality will be used")
        vdata = {mod: np.load(path).astype(np.float32) for mod, path in available.items()}

        # RL-Full under noise
        preds_rl, labels = predict_noisy(enc, cls, test_loader, device, vdata, agent=agent)
        wf1_rl = float(f1_score(labels, preds_rl, average="weighted", zero_division=0))
        # Fixed-Equal under noise
        preds_fx, _ = predict_noisy(enc, cls, test_loader, device, vdata,
                                    fixed_weights=[1/3, 1/3, 1/3])
        wf1_fx = float(f1_score(labels, preds_fx, average="weighted", zero_division=0))

        results[f"noise_{vname}_RL-Full"] = round(wf1_rl, 4)
        results[f"noise_{vname}_Fixed-Equal"] = round(wf1_fx, 4)
        # Relative gap of RL to Fixed-Equal under the SAME noise (positive means
        # RL is worse). The label used to say "vs_clean", which this is not.
        pct = (1 - wf1_rl / max(wf1_fx, 1e-6)) * 100
        logging.info(f" [{vname}] RL={wf1_rl:.4f} Fixed={wf1_fx:.4f} "
                     f"RL_degradation_vs_fixed={pct:+.1f}%")

    # ── 5. Save results ───────────────────────────────────────────────────
    os.makedirs(os.path.join(PROJ, "outputs", "results"), exist_ok=True)
    out_json = args.out_json or os.path.join(PROJ, "outputs", "results", "d1_eval.json")
    out_csv = args.out_csv or os.path.join(PROJ, "outputs", "results", "d1_ablation.csv")
    # Custom output paths may live outside the default results directory.
    for out_path in (out_json, out_csv):
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)

    with open(out_json, "w") as f:
        json.dump(results, f, indent=2)
    logging.info(f"Results saved to {out_json}")

    # CSV for ablation table: one row per clean variant that was evaluated.
    rows = []
    for variant in ["RL-Full", "Fixed-Equal", "NoRL-UniformFusion", "StageA-Only"]:
        row = {"variant": variant}
        for split in ["val", "test"]:
            k = f"{variant}_{split}"
            if k in results:
                row[f"{split}_wf1"] = results[k]["wf1"]
                row[f"{split}_acc"] = results[k]["acc"]
        if "val_wf1" in row:
            rows.append(row)
    if rows:
        with open(out_csv, "w", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=["variant","val_wf1","val_acc","test_wf1","test_acc"])
            writer.writeheader()
            writer.writerows(rows)
        logging.info(f"Ablation CSV saved to {out_csv}")

    # Noise robustness summary: drop of each method relative to its own clean
    # test score.
    logging.info("\n=== Noise Robustness Summary ===")
    clean_rl = results.get("RL-Full_test", {}).get("wf1", 0)
    clean_fx = results.get("Fixed-Equal_test", {}).get("wf1", 0)
    for vname in NOISE_VARIANTS:
        rl_k = f"noise_{vname}_RL-Full"
        fx_k = f"noise_{vname}_Fixed-Equal"
        if rl_k in results and fx_k in results:
            rl = results[rl_k]
            fx = results[fx_k]
            rl_drop = (clean_rl - rl) / max(clean_rl, 1e-6) * 100
            fx_drop = (clean_fx - fx) / max(clean_fx, 1e-6) * 100
            logging.info(f" {vname:22s} RL_drop={rl_drop:+5.1f}% Fixed_drop={fx_drop:+5.1f}%")
    logging.info("Evaluation complete.")


if __name__ == "__main__":
    main()
'''
# ── upload ────────────────────────────────────────────────────────────────
# Connection settings can be overridden through the environment. The literal
# fallbacks keep existing invocations working.
SSH_HOST = os.environ.get('D1_SSH_HOST', '10.82.3.180')
SSH_PORT = int(os.environ.get('D1_SSH_PORT', '20083'))
SSH_USER = os.environ.get('D1_SSH_USER', 'root')
# SECURITY: this fallback password is committed to the repository. Rotate it
# and supply D1_SSH_PASSWORD (or switch to key auth), then delete the literal.
SSH_PASSWORD = os.environ.get('D1_SSH_PASSWORD', 'm2dGcwyrhI')
REMOTE_EVAL_DIR = PROJ + '/scripts/eval'

client = paramiko.SSHClient()
# NOTE(review): AutoAddPolicy trusts any host key on first contact; consider
# loading known hosts instead if the server's key can be pinned.
client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
try:
    client.connect(SSH_HOST, port=SSH_PORT, username=SSH_USER,
                   password=SSH_PASSWORD, timeout=30)

    # Make eval dir, and fail loudly if the remote command did not succeed.
    _, out, err = client.exec_command(f'mkdir -p {shlex.quote(REMOTE_EVAL_DIR)}', timeout=10)
    out.read()
    err_text = err.read().decode(errors='replace').strip()
    exit_status = out.channel.recv_exit_status()
    if exit_status != 0:
        raise RuntimeError(
            f'remote mkdir failed with status {exit_status}: {err_text}'
        )

    sftp = client.open_sftp()
    try:
        sftp.putfo(io.BytesIO(EVAL_SCRIPT.encode()), REMOTE_EVAL_DIR + '/eval_d1.py')
    finally:
        sftp.close()
    print("uploaded: scripts/eval/eval_d1.py")
finally:
    # Always release the SSH connection, even when a step above raised.
    client.close()