#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF 文字提取脚本
功能：从 PDF 文件中提取文本内容并保留格式，输出为 Markdown 格式
"""

import sys
import json
from pathlib import Path

try:
    import fitz  # PyMuPDF
except ImportError:
    print(json.dumps({
        'success': False,
        'error': '缺少依赖：pymupdf。请安装：pip install pymupdf',
        'text': '',
        'page_count': 0
    }, ensure_ascii=False))
    sys.exit(1)


def extract_text_from_pdf(pdf_path: str) -> dict:
    """
    从 PDF 文件中提取文本内容并保留格式
    
    参数:
        pdf_path: PDF 文件路径
    
    返回:
        dict: {
            'success': bool,
            'text': str,  # Markdown 格式的文本
            'page_count': int,
            'error': str (如果失败)
        }
    """
    try:
        # 验证文件存在
        pdf_file = Path(pdf_path)
        if not pdf_file.exists():
            return {
                'success': False,
                'error': f'文件不存在：{pdf_path}',
                'text': '',
                'page_count': 0
            }
        
        # 验证文件格式
        if not pdf_file.suffix.lower() == '.pdf':
            return {
                'success': False,
                'error': f'文件格式错误：{pdf_file.suffix}，仅支持 PDF 格式',
                'text': '',
                'page_count': 0
            }
        
        # 打开 PDF 文件
        doc = fitz.open(pdf_path)
        page_count = len(doc)
        
        if page_count == 0:
            return {
                'success': False,
                'error': 'PDF 文件为空，无任何页面',
                'text': '',
                'page_count': 0
            }
        
        # 提取所有页面的文本
        markdown_content = []
        
        for page_num in range(page_count):
            page = doc[page_num]
            
            # 添加页面分隔符
            if page_num > 0:
                markdown_content.append('\n---\n')
            
            # 提取文本块
            blocks = page.get_text("dict")["blocks"]
            page_text = []
            
            for block in blocks:
                if block["type"] == 0:  # 文本块
                    block_text = []
                    for line in block["lines"]:
                        line_text = ""
                        for span in line["spans"]:
                            text = span["text"].strip()
                            if text:
                                # 检测是否为标题（通过字体大小判断）
                                font_size = span["size"]
                                is_bold = "bold" in span["font"].lower()
                                
                                if font_size > 16:  # 大字体可能是标题
                                    if line_text:
                                        block_text.append(line_text)
                                    line_text = f"## {text}" if not is_bold else f"### {text}"
                                else:
                                    line_text += text + " "
                        
                        if line_text.strip():
                            block_text.append(line_text.strip())
                    
                    if block_text:
                        page_text.append('\n'.join(block_text))
            
            markdown_content.append('\n\n'.join(page_text))
        
        doc.close()
        
        # 合并所有内容
        full_text = '\n'.join(markdown_content)
        
        # 清理多余的空行
        while '\n\n\n' in full_text:
            full_text = full_text.replace('\n\n\n', '\n\n')
        
        return {
            'success': True,
            'text': full_text.strip(),
            'page_count': page_count,
            'error': ''
        }
        
    except Exception as e:
        return {
            'success': False,
            'error': f'处理 PDF 时发生错误：{str(e)}',
            'text': '',
            'page_count': 0
        }


def main():
    """主函数"""
    if len(sys.argv) < 2:
        print(json.dumps({
            'success': False,
            'error': '请提供 PDF 文件路径作为参数',
            'text': '',
            'page_count': 0
        }, ensure_ascii=False))
        sys.exit(1)
    
    pdf_path = sys.argv[1]
    result = extract_text_from_pdf(pdf_path)

    # 输出 JSON 格式结果
    print(json.dumps(result, ensure_ascii=False, indent=2))
    
    # 如果失败，退出码为 1
    if not result['success']:
        sys.exit(1)


if __name__ == '__main__':
    main()