""" Step 2: LLM judge pre-annotation. Usage: python scripts/annotate_data.py --input data/raw/generated.jsonl \ --output data/processed/annotated.jsonl \ --config configs/data_generation.yaml """ import argparse import json import yaml import random from pathlib import Path from src.data.llm_judge import LLMJudge from src.data.dataset import load_jsonl def split_dataset(samples, train_ratio=0.8, val_ratio=0.1, seed=42): random.seed(seed) random.shuffle(samples) n = len(samples) n_train = int(n * train_ratio) n_val = int(n * val_ratio) return ( samples[:n_train], samples[n_train: n_train + n_val], samples[n_train + n_val:], ) def save_jsonl(samples, path): Path(path).parent.mkdir(parents=True, exist_ok=True) with open(path, "w", encoding="utf-8") as f: for s in samples: f.write(json.dumps(s, ensure_ascii=False) + "\n") print(f"Saved {len(samples)} samples → {path}") def main(): parser = argparse.ArgumentParser() parser.add_argument("--input", required=True) parser.add_argument("--output", default="data/processed/annotated.jsonl") parser.add_argument("--config", default="configs/data_generation.yaml") parser.add_argument("--skip-annotation", action="store_true", help="Skip LLM annotation (use existing labels)") args = parser.parse_args() with open(args.config) as f: cfg = yaml.safe_load(f) samples = load_jsonl(args.input) print(f"Loaded {len(samples)} samples from {args.input}") if not args.skip_annotation: judge = LLMJudge( api_type=cfg["api"]["type"], model=cfg["annotation"]["judge_model"], ) samples = judge.annotate_batch(samples, output_path=args.output) else: save_jsonl(samples, args.output) split_cfg = cfg.get("split", {"train": 0.8, "val": 0.1, "test": 0.1, "seed": 42}) train, val, test = split_dataset( samples, train_ratio=split_cfg["train"], val_ratio=split_cfg["val"], seed=split_cfg.get("seed", 42), ) base = Path(args.output).parent save_jsonl(train, base / "train.jsonl") save_jsonl(val, base / "val.jsonl") save_jsonl(test, base / "test.jsonl") print(f"Split: train={len(train)}, val={len(val)}, test={len(test)}") if __name__ == "__main__": main()