Install
openclaw skills install data-parser-toolkit
智能解析CSV、JSON、XLSX、Parquet与SQL文件,自动检测编码并修复常见格式与内容问题,提取结构化数据。
openclaw skills install data-parser-toolkit
智能解析各种数据文件格式(CSV/JSON/XLSX/Parquet/SQL),自动检测编码、修复常见问题、提取结构化数据。
常见问题及修复:
自动检测:
# Locate the header row
def detect_header_lines(content):
    """Return the index of the first header-like line in *content*.

    Only the first ten lines are inspected. A line counts as a header when
    it contains '合约代码' or '交易代码', or contains 'symbol' in any letter
    case. When no such line is found, 1 is returned as the default.
    """
    markers = ('合约代码', '交易代码')
    head = content.split('\n')[:10]
    hits = (
        idx
        for idx, row in enumerate(head)
        if any(marker in row for marker in markers) or 'symbol' in row.lower()
    )
    # Fall back to 1 when none of the inspected lines matched.
    return next(hits, 1)
常见问题及修复:
- BOM: \ufeff
- 尾部逗号: {"a": 1,} → {"a": 1}
- 单引号: {'a': 1} → {"a": 1}
- 注释: # 注释
修复函数:
def _find_string_end(text, start):
    """Return the index of the quote closing the string opened at *start*, or -1."""
    quote = text[start]
    idx = start + 1
    size = len(text)
    while idx < size:
        ch = text[idx]
        if ch == '\\':
            # Skip the escaped character so an escaped quote does not end the string.
            idx += 2
            continue
        if ch == quote:
            return idx
        idx += 1
    return -1


def _requote(literal):
    """Convert a single-quoted literal (quotes included) to a double-quoted one."""
    body = literal[1:-1]
    parts = ['"']
    idx = 0
    size = len(body)
    while idx < size:
        ch = body[idx]
        if ch == '\\' and idx + 1 < size:
            follower = body[idx + 1]
            # \' is not a valid JSON escape; inside double quotes a bare ' is fine.
            parts.append("'" if follower == "'" else ch + follower)
            idx += 2
        elif ch == '"':
            # A bare double quote must be escaped once the delimiters change.
            parts.append('\\"')
            idx += 1
        else:
            parts.append(ch)
            idx += 1
    parts.append('"')
    return ''.join(parts)


def _drop_trailing_comma(out):
    """Delete a comma that is followed only by whitespace at the end of *out*."""
    idx = len(out) - 1
    while idx >= 0 and out[idx].isspace():
        idx -= 1
    if idx >= 0 and out[idx] == ',':
        del out[idx]


def fix_json(text):
    """Repair common defects in JSON-like text and return the cleaned text.

    The following repairs are applied in a single string-aware pass, so the
    content of string values (URLs containing ``//``, values containing
    ``#``, apostrophes inside double-quoted strings) is never modified:

    * every U+FEFF (BOM) character is removed;
    * ``//`` and ``#`` line comments outside strings are removed;
    * single-quoted strings are rewritten as double-quoted strings;
    * a trailing comma before ``}`` or ``]`` is removed.

    Args:
        text: The JSON-like source text.

    Returns:
        The repaired text. It is not parsed or validated here.
    """
    text = text.replace('\ufeff', '')
    out = []  # single characters outside strings, whole literals for strings
    pos = 0
    size = len(text)
    while pos < size:
        ch = text[pos]
        if ch == '"' or ch == "'":
            end = _find_string_end(text, pos)
            if end == -1:
                # Unterminated quote: keep the character and carry on.
                out.append(ch)
                pos += 1
                continue
            literal = text[pos:end + 1]
            out.append(literal if ch == '"' else _requote(literal))
            pos = end + 1
        elif ch == '#' or text.startswith('//', pos):
            # Drop the comment but keep the newline that ends it.
            newline = text.find('\n', pos)
            pos = size if newline == -1 else newline
        elif ch == '}' or ch == ']':
            # Done after comment removal so ", // note\n}" is repaired too.
            _drop_trailing_comma(out)
            out.append(ch)
            pos += 1
        else:
            out.append(ch)
            pos += 1
    return ''.join(out)
常见问题及修复:
- merged_cells 范围
- data_only=True 读取计算值
检测XLSX是否损坏:
import zipfile
import openpyxl
def is_valid_xlsx(path):
    """Return True when *path* can be opened as an XLSX workbook.

    Two checks are made: the file must open as a ZIP archive, and openpyxl
    must be able to load it with ``data_only=True``.

    Args:
        path: Location of the file to check.

    Returns:
        True if both checks succeed, False if either raises an error.
    """
    try:
        # Check 1: the file must open as a ZIP archive.
        with zipfile.ZipFile(path, 'r'):
            pass
        # Check 2: openpyxl must be able to load the workbook.
        wb = openpyxl.load_workbook(path, data_only=True)
        wb.close()
    except Exception:
        # Only ordinary errors mean "invalid"; KeyboardInterrupt and
        # SystemExit are allowed to propagate.
        return False
    return True
特点: 高压缩率、适合大数据分析
import pyarrow.parquet as pq
def read_parquet(path):
    """Read the Parquet file at *path* and return it converted via ``to_pandas()``."""
    return pq.read_table(path).to_pandas()
常见问题:
- CHARSET=utf8mb4
- INSERT INTO ... VALUES (...), (...), ...
- \' → ' 或 ''
import chardet
def detect_encoding(path):
    """Guess the text encoding of the file at *path*.

    The first 10000 bytes are handed to ``chardet.detect``. When chardet
    reports no encoding, 'utf-8' is returned instead.
    """
    with open(path, 'rb') as handle:
        sample = handle.read(10000)  # only a leading sample is inspected
    guess = chardet.detect(sample)['encoding']
    if guess:
        return guess
    return 'utf-8'
import pandas as pd
import chardet
def smart_read_csv(path, **kwargs):
    """Read a CSV file, trying the detected encoding and then fallbacks.

    The encoding reported by ``detect_encoding`` is tried first, followed by
    'gbk', 'gb2312', 'utf-8-sig' and 'latin1'. The first successful read is
    returned.

    Args:
        path: Location of the CSV file.
        **kwargs: Passed through unchanged to ``pd.read_csv``.

    Returns:
        The result of ``pd.read_csv`` for the first encoding that works.

    Raises:
        Exception: The error from the last attempt when every encoding
            fails (previously this surfaced as an UnboundLocalError).
    """
    detected = detect_encoding(path)
    candidates = [detected]
    for fallback in ('gbk', 'gb2312', 'utf-8-sig', 'latin1'):
        # Retrying the encoding that was already tried cannot succeed.
        if fallback.lower() != detected.lower():
            candidates.append(fallback)

    last_error = None
    for enc in candidates:
        try:
            return pd.read_csv(path, encoding=enc, **kwargs)
        except Exception as err:
            # Best effort: remember the failure and move on to the next encoding.
            last_error = err
    raise last_error
def smart_read_xlsx(path):
    """Read the active sheet of an XLSX file into a list of rows.

    The file is first checked with ``is_valid_xlsx``; when that check fails
    a warning is printed and None is returned. Otherwise the workbook is
    loaded with ``data_only=True`` and every row of the active sheet is
    returned as a list of cell values, skipping rows that are entirely
    empty.

    Args:
        path: Location of the XLSX file.

    Returns:
        A list of rows (each a list of cell values), or None when the file
        fails the validity check.
    """
    if not is_valid_xlsx(path):
        print(f"警告: {path} 可能损坏")
        return None

    wb = openpyxl.load_workbook(path, data_only=True)
    try:
        ws = wb.active
        # A row is empty only when every cell is None or "". Rows holding
        # 0 or False carry real data and must be kept.
        return [
            list(row)
            for row in ws.iter_rows(values_only=True)
            if any(cell is not None and cell != "" for cell in row)
        ]
    finally:
        # Close the workbook even if reading the rows raises.
        wb.close()
from data_parser import parse_file
# Detect the file format automatically and parse it
data = parse_file("data.csv") # returns DataFrame/List
data = parse_file("data.json") # returns dict/List
data = parse_file("data.xlsx") # returns List[List]
data = parse_file("data.parquet") # returns DataFrame
from data_parser import convert_folder
# Convert every XLSX file in the folder to CSV
convert_folder(
input_dir="D:/data/xlsx",
output_dir="D:/data/csv",
output_format="csv"
)
pip install pandas openpyxl chardet pyarrow
data_only=True 获取计算值,否则得到公式