#!/usr/bin/env python3 """Multi-backend PDF extraction & triage for Chinese resume screening. Runs available PDF text backends (PyMuPDF / pdfplumber / pdfminer.six) against each PDF, cleans the output, scores it, and writes the best extraction to /.txt. A `summary.json` and a Markdown table are also emitted so a downstream screening step can decide which resumes are high / medium / low quality. Usage ----- python3 resume_pdf_extract.py path/to/resume.pdf --output-dir /tmp/out python3 resume_pdf_extract.py path/to/resume_dir --output-dir /tmp/out Install (any subset works; more backends → better triage): pip install --user pymupdf pdfplumber pdfminer.six License: MIT. See repository LICENSE. """ from __future__ import annotations import argparse import json import re import sys import unicodedata from dataclasses import dataclass, asdict from pathlib import Path from typing import Callable, Dict, List, Optional # Keyword bag used to compute a coarse "is this actually a resume" hit score. # Tuned for Chinese tech / product / market resumes — extend per locale. KEYWORDS = [ "工作经历", "工作经验", "项目经验", "教育背景", "教育经历", "实习经历", "实习经验", "技能", "项目", "负责", "主导", "参与", "团队", "毕业", "学位", "本科", "硕士", "博士", "GPA", "Python", "Java", "C++", "JavaScript", "TypeScript", "Go", "算法", "模型", "数据", "产品", "运营", "市场", "增长", ] # Tracking/decoration patterns we strip from raw text before scoring. TRACKING_LINE = re.compile(r"^[A-Za-z0-9_\-]{16,}$") SHORT_FRAGMENT = re.compile(r"^[\s\W]{0,2}[A-Za-z0-9]{1,2}[\s\W]{0,2}$") REPEATED_CHAR_LINE = re.compile(r"^(?:(\S)\s*){2,}$") # --------------------------------------------------------------------------- # Backend wrappers # --------------------------------------------------------------------------- def _try_import(module: str): try: return __import__(module) except Exception: return None def _warn(backend: str, path: Path, err: Exception) -> None: print(f"warn: {backend} failed on {path.name}: {type(err).__name__}: {err}", file=sys.stderr) def _extract_fitz(path: Path) -> Optional[str]: fitz = _try_import("fitz") if fitz is None: return None try: with fitz.open(path) as doc: return "\n".join(page.get_text() for page in doc) except Exception as e: _warn("fitz", path, e) return None def _extract_pdfplumber(path: Path) -> Optional[str]: pdfplumber = _try_import("pdfplumber") if pdfplumber is None: return None try: chunks: List[str] = [] with pdfplumber.open(str(path)) as pdf: for page in pdf.pages: text = page.extract_text() or "" chunks.append(text) return "\n".join(chunks) except Exception as e: _warn("pdfplumber", path, e) return None def _extract_pdfminer(path: Path) -> Optional[str]: try: from pdfminer.high_level import extract_text # type: ignore except Exception: return None try: return extract_text(str(path)) except Exception as e: _warn("pdfminer", path, e) return None BACKENDS: Dict[str, Callable[[Path], Optional[str]]] = { "fitz": _extract_fitz, "pdfplumber": _extract_pdfplumber, "pdfminer": _extract_pdfminer, } # --------------------------------------------------------------------------- # Cleaning + scoring # --------------------------------------------------------------------------- def clean_text(raw: str) -> str: """Normalize Unicode, fold whitespace, drop tracking/fragment lines.""" if not raw: return "" text = unicodedata.normalize("NFKC", raw) text = text.replace(" ", " ") text = re.sub(r"[ \t]+", " ", text) keep: List[str] = [] for line in text.splitlines(): stripped = line.strip() if not stripped: keep.append("") continue if TRACKING_LINE.match(stripped): continue if SHORT_FRAGMENT.match(stripped): continue if REPEATED_CHAR_LINE.match(stripped): continue keep.append(stripped) cleaned = "\n".join(keep) cleaned = re.sub(r"\n{3,}", "\n\n", cleaned) return cleaned.strip() @dataclass class BackendResult: backend: str chars: int lines: int keyword_hits: int score: float quality: str text: str def score_text(text: str) -> BackendResult: if not text: return BackendResult("none", 0, 0, 0, 0.0, "low", "") lines = [l for l in text.splitlines() if l.strip()] chars = len(text) line_count = len(lines) keyword_hits = sum(1 for kw in KEYWORDS if kw.lower() in text.lower()) # Score: weighted blend of size + keyword recall, penalize tracking noise. size_component = min(chars, 8000) / 80.0 # up to ~100 line_component = min(line_count, 200) / 2.0 # up to ~100 keyword_component = min(keyword_hits, 20) * 5.0 # up to ~100 score = round((size_component + line_component + keyword_component) / 3.0, 2) if chars >= 2600 and line_count >= 55 and score >= 65: quality = "high" elif chars >= 1600 and line_count >= 35 and score >= 35: quality = "medium" else: quality = "low" return BackendResult( backend="", chars=chars, lines=line_count, keyword_hits=keyword_hits, score=score, quality=quality, text=text, ) # --------------------------------------------------------------------------- # Triage driver # --------------------------------------------------------------------------- def triage_pdf(path: Path) -> Dict: candidates: List[BackendResult] = [] for name, fn in BACKENDS.items(): raw = fn(path) if raw is None: continue cleaned = clean_text(raw) result = score_text(cleaned) result.backend = name candidates.append(result) if not candidates: return { "file": str(path), "backend": None, "chars": 0, "lines": 0, "keyword_hits": 0, "score": 0.0, "quality": "low", "reason": "no available backend produced text", } best = max(candidates, key=lambda r: r.score) return { "file": str(path), "backend": best.backend, "chars": best.chars, "lines": best.lines, "keyword_hits": best.keyword_hits, "score": best.score, "quality": best.quality, "text": best.text, "candidates": [ { **{k: v for k, v in asdict(c).items() if k != "text"}, } for c in candidates ], } def iter_pdfs(target: Path): if target.is_file(): yield target return for entry in sorted(target.rglob("*.pdf")): if entry.is_file(): yield entry def render_markdown(rows: List[Dict]) -> str: header = "| 文件 | 后端 | 质量 | 字符数 | 行数 | 关键词命中 | 评分 |\n" sep = "| --- | --- | --- | ---: | ---: | ---: | ---: |\n" body = "".join( "| {file} | {backend} | {quality} | {chars} | {lines} | {kw} | {score} |\n".format( file=Path(row["file"]).name, backend=row.get("backend") or "—", quality=row.get("quality", "low"), chars=row.get("chars", 0), lines=row.get("lines", 0), kw=row.get("keyword_hits", 0), score=row.get("score", 0.0), ) for row in rows ) return header + sep + body def main() -> int: parser = argparse.ArgumentParser(description=__doc__.split("\n\n")[0]) parser.add_argument("target", type=Path, help="PDF file or directory") parser.add_argument("--output-dir", type=Path, required=True, help="Directory to write best-text .txt files and summary.json") parser.add_argument("--no-markdown", action="store_true", help="Suppress the Markdown table on stdout") args = parser.parse_args() if not args.target.exists(): print(f"error: target does not exist: {args.target}", file=sys.stderr) return 2 args.output_dir.mkdir(parents=True, exist_ok=True) available: List[str] = [] for name in BACKENDS: module_name = {"fitz": "fitz", "pdfplumber": "pdfplumber", "pdfminer": "pdfminer.high_level"}[name] try: __import__(module_name.split(".")[0]) available.append(name) except Exception: pass if not available: print("error: no PDF backend available. Install at least one of " "pymupdf / pdfplumber / pdfminer.six", file=sys.stderr) return 2 rows: List[Dict] = [] for pdf in iter_pdfs(args.target): result = triage_pdf(pdf) text = result.pop("text", "") if isinstance(result, dict) else "" if result.get("backend"): out_path = args.output_dir / f"{pdf.stem}.{result['backend']}.txt" out_path.write_text(text, encoding="utf-8") result["output"] = str(out_path) rows.append(result) summary_path = args.output_dir / "summary.json" summary_path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8") if not args.no_markdown: print(render_markdown(rows)) print(f"\nbackends available: {', '.join(available)}", file=sys.stderr) print(f"summary: {summary_path}", file=sys.stderr) return 0 if __name__ == "__main__": raise SystemExit(main())