Install
openclaw skills install doc-extract-filter
支持 PDF、Word、Excel 文件的文本提取和按关键词筛选,返回完整或筛选后的文本内容。
openclaw skills install doc-extract-filter
{
"name": "doc-extract-filter",
"description": "文件处理技能,支持多种文件格式的文本提取、关键词/正则表达式筛选、排除筛选和批量文件处理",
"version": "1.1.1",
"author": "file-agent team",
"license": "MIT-0",
"type": "tool",
"entry_point": "scripts/doc-extract-filter.py",
"parameters": {
"file_path": {
"type": "string",
"description": "文件路径",
"required": false
},
"action": {
"type": "string",
"description": "操作类型:extract 或 filter",
"required": true
},
"keywords": {
"type": "array",
"description": "关键词列表(仅 filter 操作需要)",
"required": false
},
"regex": {
"type": "string",
"description": "正则表达式模式(仅 filter 操作需要)",
"required": false
},
"enable_ocr": {
"type": "boolean",
"description": "启用 OCR 支持(用于扫描件 PDF)",
"required": false
},
"exclude_keywords": {
"type": "array",
"description": "排除关键词列表(仅 filter 操作需要)",
"required": false
},
"exclude_regex": {
"type": "string",
"description": "排除正则表达式模式(仅 filter 操作需要)",
"required": false
},
"context_length": {
"type": "integer",
"description": "上下文长度(默认50字符)",
"required": false
},
"filter_level": {
"type": "string",
"description": "筛选级别:line(按行)或 paragraph(按段落)",
"required": false
},
"batch": {
"type": "boolean",
"description": "开启批量处理模式",
"required": false
},
"input_dir": {
"type": "string",
"description": "批量处理的输入文件夹路径",
"required": false
},
"file_paths": {
"type": "array",
"description": "批量处理的文件列表",
"required": false
},
"output_dir": {
"type": "string",
"description": "批量结果输出目录",
"required": false
},
"merge_results": {
"type": "boolean",
"description": "是否合并所有文件结果为一个 JSON 文件",
"required": false
}
}
}
name: doc-extract-filter
description: 文件处理技能,支持多种文件格式的文本提取、关键词/正则表达式筛选、排除筛选和批量文件处理
version: 1.1.1
author: file-agent team
license: MIT-0
type: tool
entry_point: scripts/doc-extract-filter.py
parameters:
  file_path:
    type: string
    description: 文件路径
    required: false
  action:
    type: string
    description: 操作类型:extract 或 filter
    required: true
  keywords:
    type: array
    description: 关键词列表(仅 filter 操作需要)
    required: false
  regex:
    type: string
    description: 正则表达式模式(仅 filter 操作需要)
    required: false
  enable_ocr:
    type: boolean
    description: 启用 OCR 支持(用于扫描件 PDF)
    required: false
  exclude_keywords:
    type: array
    description: 排除关键词列表(仅 filter 操作需要)
    required: false
  exclude_regex:
    type: string
    description: 排除正则表达式模式(仅 filter 操作需要)
    required: false
  context_length:
    type: integer
    description: 上下文长度(默认50字符)
    required: false
  filter_level:
    type: string
    description: 筛选级别:line(按行)或 paragraph(按段落)
    required: false
  batch:
    type: boolean
    description: 开启批量处理模式
    required: false
  input_dir:
    type: string
    description: 批量处理的输入文件夹路径
    required: false
  file_paths:
    type: array
    description: 批量处理的文件列表
    required: false
  output_dir:
    type: string
    description: 批量结果输出目录
    required: false
  merge_results:
    type: boolean
    description: 是否合并所有文件结果为一个 JSON 文件
    required: false
# 单个文件处理
python scripts/doc-extract-filter.py --file_path "path/to/file.pdf" --action "extract"
python scripts/doc-extract-filter.py --file_path "path/to/file.pdf" --action "filter" --keywords "关键词1,关键词2"
python scripts/doc-extract-filter.py --file_path "path/to/file.pdf" --action "filter" --regex "\d{4}-\d{2}-\d{2}"
# 提取 PDF 扫描件(启用 OCR)
python scripts/doc-extract-filter.py --file_path "path/to/scanned.pdf" --action "extract" --enable-ocr
# 筛选并排除指定内容
python scripts/doc-extract-filter.py --file_path "path/to/file.pdf" --action "filter" --keywords "关键词" --exclude-keywords "排除词"
# 设置上下文长度和筛选级别
python scripts/doc-extract-filter.py --file_path "path/to/file.pdf" --action "filter" --keywords "关键词" --context-length 100 --filter-level "paragraph"
# 批量处理 - 文件夹路径
python scripts/doc-extract-filter.py --batch --input-dir "path/to/folder" --action "extract" --output-dir "batch-results"
# 批量处理 - 文件列表
python scripts/doc-extract-filter.py --batch --file-paths "path/to/file1.pdf,path/to/file2.docx" --action "extract" --output-dir "batch-results"
# 批量处理并合并结果
python scripts/doc-extract-filter.py --batch --input-dir "path/to/folder" --action "extract" --output-dir "batch-results" --merge-results
# 批量筛选
python scripts/doc-extract-filter.py --batch --input-dir "path/to/folder" --action "filter" --keywords "关键词" --output-dir "batch-results"
from scripts.doc_extract_filter import DocExtractFilter
# 提取文本
result = DocExtractFilter.process("path/to/file.pdf", "extract")
# 提取 PDF 扫描件(启用 OCR)
result = DocExtractFilter.process("path/to/scanned.pdf", "extract", enable_ocr=True)
# 筛选关键词
result = DocExtractFilter.process("path/to/file.pdf", "filter", ["关键词1", "关键词2"])
# 筛选并排除指定内容
result = DocExtractFilter.process("path/to/file.pdf", "filter", ["关键词"], exclude_keywords=["排除词"])
# 设置上下文长度和筛选级别
result = DocExtractFilter.process("path/to/file.pdf", "filter", ["关键词"], context_length=100, filter_level="paragraph")
# 使用正则表达式筛选
result = DocExtractFilter.process("path/to/file.pdf", "filter", regex_pattern=r"\d{4}-\d{2}-\d{2}")
# 批量处理 - 文件夹路径
result = DocExtractFilter.batch_process(
input_dir="path/to/folder",
action="extract",
output_dir="batch-results"
)
# 批量处理 - 文件列表
result = DocExtractFilter.batch_process(
file_paths=["path/to/file1.pdf", "path/to/file2.docx"],
action="extract",
output_dir="batch-results"
)
# 批量处理并合并结果
result = DocExtractFilter.batch_process(
input_dir="path/to/folder",
action="extract",
output_dir="batch-results",
merge_results=True
)
{
"success": true,
"data": {
"text": "提取的文本内容",
"filtered_text": "筛选后的文本内容" // 仅 filter 操作返回
},
"error": ""
}
将 doc-extract-filter 目录复制到 OpenClaw/CoPaw 的 skills 目录
运行 pip install -r requirements.txt 安装依赖
使用 docs/test.pdf 文件测试功能:
# 测试提取文本
python scripts/doc-extract-filter.py --file_path "docs/test.pdf" --action "extract"
# 测试关键词筛选
python scripts/doc-extract-filter.py --file_path "docs/test.pdf" --action "filter" --keywords "单价,小计,总金额"
# 测试排除筛选
python scripts/doc-extract-filter.py --file_path "docs/test.pdf" --action "filter" --keywords "单价" --exclude-keywords "小计"
doc-extract-filter 现在包含了所有必要的核心代码,可以独立运行,不依赖于外部的 src 目录。