#!/usr/bin/env python3
"""
Cross-Reference Audit Script for Multi-Agent LaTeX Reports.

When multiple agents write chapters in parallel, duplicate labels,
undefined references, and inconsistent cross-references are inevitable.
This script catches them all before compilation.

Usage:
    python cross_ref_audit.py path/to/report/
    python cross_ref_audit.py path/to/report/ --json
    python cross_ref_audit.py path/to/report/ --verbose
"""

import argparse
import json
import re
import sys
from collections import defaultdict
from collections.abc import Iterator
from dataclasses import dataclass, field
from pathlib import Path

# An unescaped "%" starts a LaTeX comment. The captured group keeps any run of
# escaped backslashes ("\\") that sits directly before the "%", so "\\% note"
# is a line break followed by a comment, while "\%" is a literal percent sign.
# Known limitation: a raw "%" inside \url{...} or verbatim text is treated as
# a comment start as well.
_COMMENT_RE = re.compile(r"(?<!\\)((?:\\\\)*)%.*")

_LABEL_RE = re.compile(r"\\label\{([^}]+)\}")

# \ref, \cref, \Cref, \vref, \Vref, \autoref, \eqref, \pageref, \nameref and
# their starred forms (e.g. hyperref's \ref*).
_REF_RE = re.compile(
    r"\\(?:[cCvV]?ref|autoref|eqref|pageref|nameref)\*?\{([^}]+)\}"
)

# natbib / biblatex citation commands, allowing a star and up to two optional
# "[...]" arguments before the key list, e.g. \citep[see][p.~3]{key1,key2}.
_CITE_RE = re.compile(
    r"""
    \\(?:
        [Cc]ite(?:[tp]|alp|alt|author|year|yearpar)?
      | (?:[Pp]aren|[Tt]ext|[Aa]uto|foot)cite
    )
    \*?
    (?:\s*\[[^\]]*\]){0,2}
    \s*\{([^}]+)\}
    """,
    re.VERBOSE,
)

_BIB_ENTRY_RE = re.compile(r"@(\w+)\s*\{\s*([^,\s}]+)")

# "@" blocks that are BibTeX directives, not citable entries.
_NON_ENTRY_TYPES = frozenset({"string", "comment", "preamble"})

# Maximum number of warnings printed by print_report().
_MAX_WARNINGS_SHOWN = 20


@dataclass
class AuditResult:
    """Findings of one audit run.

    The first four categories below count as errors; orphaned labels and
    warnings are informational only.
    """

    # label -> "file:line" locations, for labels defined more than once
    duplicate_labels: dict[str, list[str]] = field(default_factory=dict)
    # referenced label -> "file:line" locations, for labels never defined
    undefined_refs: dict[str, list[str]] = field(default_factory=dict)
    # cited key -> "file:line" locations, for keys missing from all .bib files
    undefined_cites: dict[str, list[str]] = field(default_factory=dict)
    # labels that are defined but never referenced
    orphaned_labels: list[str] = field(default_factory=list)
    # one human-readable message per BibTeX key defined more than once
    bib_duplicates: list[str] = field(default_factory=list)
    # free-form advisory messages
    warnings: list[str] = field(default_factory=list)

    @property
    def has_errors(self) -> bool:
        """Return True if any error category is non-empty."""
        return bool(
            self.duplicate_labels
            or self.undefined_refs
            or self.undefined_cites
            or self.bib_duplicates
        )

    @property
    def error_count(self) -> int:
        """Return the number of distinct offending keys across error categories."""
        return (
            len(self.duplicate_labels)
            + len(self.undefined_refs)
            + len(self.undefined_cites)
            + len(self.bib_duplicates)
        )


def find_tex_files(directory: Path) -> list[Path]:
    """Find all .tex files recursively, in sorted order."""
    return sorted(directory.rglob("*.tex"))


def find_bib_files(directory: Path) -> list[Path]:
    """Find all .bib files recursively, in sorted order."""
    return sorted(directory.rglob("*.bib"))


def _iter_code_lines(filepath: Path) -> Iterator[tuple[int, str]]:
    """Yield (1-based line number, text) for each line, with comments removed."""
    content = filepath.read_text(encoding="utf-8", errors="replace")
    for lineno, line in enumerate(content.splitlines(), 1):
        code = _COMMENT_RE.sub(r"\1", line)
        if code.strip():
            yield lineno, code


def _extract_key_lists(
    filepath: Path, pattern: re.Pattern[str]
) -> list[tuple[str, int]]:
    """Collect comma-separated keys captured by *pattern*, skipping empty ones."""
    found = []
    for lineno, code in _iter_code_lines(filepath):
        for match in pattern.finditer(code):
            for key in match.group(1).split(","):
                key = key.strip()
                # A trailing comma ("\cref{a,}") yields an empty key; it is
                # not a real reference and must not be reported as undefined.
                if key:
                    found.append((key, lineno))
    return found


def extract_labels(filepath: Path) -> list[tuple[str, int]]:
    """Extract all \\label{} definitions with line numbers.

    Text after an unescaped ``%`` is ignored, so commented-out labels
    (whole-line or trailing) are not reported.
    """
    return [
        (match.group(1), lineno)
        for lineno, code in _iter_code_lines(filepath)
        for match in _LABEL_RE.finditer(code)
    ]


def extract_refs(filepath: Path) -> list[tuple[str, int]]:
    """Extract all \\ref{}, \\cref{}, \\Cref{}, \\autoref{} etc. references.

    Comma-separated lists such as ``\\cref{fig:a,fig:b}`` yield one entry per
    key. Commented-out text is ignored.
    """
    return _extract_key_lists(filepath, _REF_RE)


def extract_cites(filepath: Path) -> list[tuple[str, int]]:
    """Extract all \\cite{}, \\citep{}, \\citet{} etc. citation keys.

    Optional arguments (``\\citep[p.~3]{key}``) and starred commands are
    accepted. Commented-out text is ignored.
    """
    return _extract_key_lists(filepath, _CITE_RE)


def extract_bib_keys(filepath: Path) -> list[tuple[str, int]]:
    """Extract all BibTeX entry keys with line numbers.

    ``@string``, ``@comment`` and ``@preamble`` blocks are skipped because
    they do not define citable entries.
    """
    keys = []
    content = filepath.read_text(encoding="utf-8", errors="replace")
    for lineno, line in enumerate(content.splitlines(), 1):
        for match in _BIB_ENTRY_RE.finditer(line):
            if match.group(1).lower() in _NON_ENTRY_TYPES:
                continue
            keys.append((match.group(2), lineno))
    return keys


def audit(directory: Path, verbose: bool = False) -> AuditResult:
    """Run full cross-reference audit on a LaTeX project directory.

    Args:
        directory: Root of the project; searched recursively for .tex/.bib.
        verbose: If true, print a one-line scan summary to stderr.

    Returns:
        An AuditResult. If no .tex files exist it contains a single warning
        and nothing else.
    """
    result = AuditResult()
    tex_files = find_tex_files(directory)
    bib_files = find_bib_files(directory)

    if not tex_files:
        result.warnings.append(f"No .tex files found in {directory}")
        return result

    if verbose:
        # stderr keeps stdout clean for machine-readable (--json) output.
        print(
            f"Scanning {len(tex_files)} .tex files, {len(bib_files)} .bib files",
            file=sys.stderr,
        )

    # Collect all labels, refs, cites: key -> list of "relative/path:line"
    all_labels: dict[str, list[str]] = defaultdict(list)
    all_refs: dict[str, list[str]] = defaultdict(list)
    all_cites: dict[str, list[str]] = defaultdict(list)
    all_bib_keys: dict[str, list[str]] = defaultdict(list)

    for f in tex_files:
        relpath = str(f.relative_to(directory))
        for label, line in extract_labels(f):
            all_labels[label].append(f"{relpath}:{line}")
        for ref, line in extract_refs(f):
            all_refs[ref].append(f"{relpath}:{line}")
        for cite, line in extract_cites(f):
            all_cites[cite].append(f"{relpath}:{line}")

    for f in bib_files:
        relpath = str(f.relative_to(directory))
        for key, line in extract_bib_keys(f):
            all_bib_keys[key].append(f"{relpath}:{line}")

    # Check 1: Duplicate labels
    result.duplicate_labels = {
        label: locations
        for label, locations in all_labels.items()
        if len(locations) > 1
    }

    # Check 2: Undefined references
    result.undefined_refs = {
        ref: locations
        for ref, locations in all_refs.items()
        if ref not in all_labels
    }

    # Check 3: Undefined citations
    result.undefined_cites = {
        cite: locations
        for cite, locations in all_cites.items()
        if cite not in all_bib_keys
    }

    # Sorted once so that checks 4 and the prefix warnings come out in the
    # same order on every run (set iteration order is not stable).
    sorted_labels = sorted(all_labels)

    # Check 4: Orphaned labels (defined but never referenced)
    result.orphaned_labels = [
        label for label in sorted_labels if label not in all_refs
    ]

    # Check 5: Duplicate BibTeX keys
    result.bib_duplicates = [
        f"{key} defined at: {', '.join(locations)}"
        for key, locations in all_bib_keys.items()
        if len(locations) > 1
    ]

    # Warnings: labels without chapter prefix
    result.warnings.extend(
        f"Label '{label}' has no prefix — consider using ch1:, fig:ch3:, etc."
        for label in sorted_labels
        if ":" not in label
    )

    return result


def _print_located(title: str, macro: str, entries: dict[str, list[str]]) -> None:
    """Print one error section: each key as \\macro{key} with its locations."""
    print(f"\n {title} ({len(entries)})")
    print(" " + "-" * 40)
    for key, locations in sorted(entries.items()):
        print(f" \\{macro}{{{key}}}")
        for loc in locations:
            print(f" -> {loc}")


def print_report(result: AuditResult) -> None:
    """Print a formatted audit report to stdout."""
    print("\n" + "=" * 60)
    print(" CROSS-REFERENCE AUDIT REPORT")
    print("=" * 60)

    if not result.has_errors and not result.orphaned_labels and not result.warnings:
        print("\n All clear — no issues found.\n")
        return

    # Errors
    if result.duplicate_labels:
        _print_located("DUPLICATE LABELS", "label", result.duplicate_labels)
    if result.undefined_refs:
        _print_located("UNDEFINED REFERENCES", "ref", result.undefined_refs)
    if result.undefined_cites:
        _print_located("UNDEFINED CITATIONS", "cite", result.undefined_cites)

    if result.bib_duplicates:
        print(f"\n DUPLICATE BIB KEYS ({len(result.bib_duplicates)})")
        print(" " + "-" * 40)
        for dup in result.bib_duplicates:
            print(f" {dup}")

    # Warnings
    if result.orphaned_labels:
        print(f"\n ORPHANED LABELS ({len(result.orphaned_labels)})")
        print(" " + "-" * 40)
        for label in sorted(result.orphaned_labels):
            print(f" \\label{{{label}}} — defined but never referenced")

    if result.warnings:
        print(f"\n WARNINGS ({len(result.warnings)})")
        print(" " + "-" * 40)
        for warning in result.warnings[:_MAX_WARNINGS_SHOWN]:  # Limit output
            print(f" {warning}")
        hidden = len(result.warnings) - _MAX_WARNINGS_SHOWN
        if hidden > 0:
            print(f" ... and {hidden} more")

    # Summary
    print("\n" + "=" * 60)
    if result.has_errors:
        print(f" RESULT: {result.error_count} ERRORS found — fix before compiling")
    else:
        print(" RESULT: No errors (warnings only)")
    print("=" * 60 + "\n")


def main() -> None:
    """Parse command-line arguments, run the audit, and exit 1 on errors."""
    parser = argparse.ArgumentParser(
        description="Audit LaTeX cross-references for multi-agent reports"
    )
    parser.add_argument("directory", help="Path to LaTeX project directory")
    parser.add_argument(
        "--verbose", "-v", action="store_true", help="Verbose output"
    )
    parser.add_argument(
        "--json", action="store_true", help="Output as JSON"
    )
    args = parser.parse_args()

    directory = Path(args.directory)
    if not directory.is_dir():
        print(f"Error: {directory} is not a directory", file=sys.stderr)
        sys.exit(1)

    result = audit(directory, verbose=args.verbose)

    if args.json:
        output = {
            "duplicate_labels": result.duplicate_labels,
            "undefined_refs": result.undefined_refs,
            "undefined_cites": result.undefined_cites,
            "orphaned_labels": result.orphaned_labels,
            "bib_duplicates": result.bib_duplicates,
            "warnings": result.warnings,
            "error_count": result.error_count,
        }
        print(json.dumps(output, indent=2))
    else:
        print_report(result)

    sys.exit(1 if result.has_errors else 0)


if __name__ == "__main__":
    main()