#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
HTML完整性检查脚本
用于检查和修复HTML报告中的缺失内容
"""
import argparse
import json
import os
import re
import sys
from datetime import datetime
def generate_default_action(score):
"""根据评分生成默认行动处方"""
try:
score = int(score)
except:
score = 0
if score >= 80:
return '''
行动处方
1. 保持当前内容策略,持续优化细节
2. 关注粉丝反馈,增强互动粘性
3. 探索变现路径,提升商业价值
'''
elif score >= 60:
return '''
行动处方
1. 强化内容垂直度,明确账号定位
2. 提升更新频率,保持稳定输出
3. 优化封面视觉,提升点击率
'''
else:
return '''
行动处方
1. 明确账号定位,聚焦垂直领域
2. 提升内容质量,增加互动引导
3. 保持稳定更新,积累账号权重
'''
def check_and_fix_html_content(html, report_data):
"""检查HTML内容完整性,识别缺失字段并修复
返回: (修复后的html, 缺失字段列表)
"""
missing_fields = []
# 定义关键字段及其默认值
critical_fields = {
# 头部信息
"账号名": report_data.get("header", {}).get("账号名", "未知账号"),
"账号标识": report_data.get("header", {}).get("账号标识", "素人"),
"数据获取时间": report_data.get("header", {}).get("数据获取时间", datetime.now().strftime("%Y-%m-%d %H:%M")),
"综合评分": report_data.get("scores", {}).get("综合评分", "0"),
# 评分字段
"账号定位得分": report_data.get("scores", {}).get("账号定位得分", "0"),
"粉丝画像与需求得分": report_data.get("scores", {}).get("粉丝画像与需求得分", "0"),
"选题体系得分": report_data.get("scores", {}).get("选题体系得分", "0"),
"封面风格得分": report_data.get("scores", {}).get("封面风格得分", "0"),
"爆文能力得分": report_data.get("scores", {}).get("爆文能力得分", "0"),
"互动规模得分": report_data.get("scores", {}).get("互动规模得分", "0"),
"更新产能得分": report_data.get("scores", {}).get("更新产能得分", "0"),
# 简述原因
"账号定位简述原因": report_data.get("scores", {}).get("账号定位简述原因", "账号定位清晰"),
"粉丝画像与需求简述原因": report_data.get("scores", {}).get("粉丝画像与需求简述原因", "粉丝画像较清晰"),
"选题体系简述原因": report_data.get("scores", {}).get("选题体系简述原因", "选题方向明确"),
"封面风格简述原因": report_data.get("scores", {}).get("封面风格简述原因", "封面风格统一"),
"爆文能力简述原因": report_data.get("scores", {}).get("爆文能力简述原因", "爆文表现良好"),
"互动规模简述原因": report_data.get("scores", {}).get("互动规模简述原因", "互动表现良好"),
"更新产能简述原因": report_data.get("scores", {}).get("更新产能简述原因", "更新频率稳定"),
}
# 检查HTML中的关键模块是否有内容
# 1. 检查更新时间
update_time_match = re.search(r'数据获取时间[::]\s*\s*([^<]*)', html)
if update_time_match:
value = update_time_match.group(1).strip()
if not value or value == "":
missing_fields.append("数据获取时间")
fix_value = report_data.get("header", {}).get("数据获取时间", datetime.now().strftime("%Y-%m-%d %H:%M"))
html = html.replace(
f'{value}',
f'{fix_value}',
1
)
# 2. 检查行动处方模块
action_section_match = re.search(r'(.*?)
\s*', html, re.DOTALL)
if action_section_match:
action_content = action_section_match.group(1)
if "问题归因" in action_content and ("暂无" in action_content or action_content.count(").*?(
\s*)',
r'\1' + default_action + r'\2',
html,
flags=re.DOTALL
)
# 3. 检查综合诊断结论
conclusion_match = re.search(r'综合诊断结论.*?(.*?)
', html, re.DOTALL)
if conclusion_match:
conclusion_content = conclusion_match.group(1).strip()
if not conclusion_content or len(conclusion_content) < 20:
missing_fields.append("综合诊断结论")
account_name = report_data.get("header", {}).get("账号名", "该账号")
default_conclusion = f"{account_name}整体运营状况良好,建议持续优化内容质量和互动策略。"
html = re.sub(
r'().*?(
)',
rf'\1{default_conclusion}\2',
html,
flags=re.DOTALL
)
# 4. 检查账号定位模块关键字段
positioning_fields = ["TA是谁", "心智占位", "价值观锚点", "吸引力类型"]
for pf in positioning_fields:
pattern = rf'{pf}[::]\s*\s*([^<]*)'
match = re.search(pattern, html)
if match:
value = match.group(1).strip()
if not value:
missing_fields.append(pf)
# 5. 检查粉丝画像模块
fans_profile_match = re.search(r'粉丝构成[::]\s*\s*([^<]*)', html)
if fans_profile_match:
value = fans_profile_match.group(1).strip()
if not value:
missing_fields.append("粉丝构成")
raw = report_data.get("_raw", {})
fans = raw.get("fans", 0)
gender = raw.get("fansGender", {})
female_ratio = gender.get("female_ratio", "0.5")
try:
fix_value = f"{fans}粉丝 | 女性{float(female_ratio)*100:.0f}%"
except:
fix_value = f"{fans}粉丝"
html = html.replace(
f'{value}',
f'{fix_value}',
1
)
# 6. 检查选题体系模块
topic_fields = ["选题方向", "表达风格", "叙事手法"]
for tf in topic_fields:
pattern = rf'{tf}[::]\s*\s*([^<]*)'
match = re.search(pattern, html)
if match:
value = match.group(1).strip()
if not value:
missing_fields.append(tf)
# 7. 检查封面风格模块
cover_fields = ["视觉特征", "信息层级", "一致性"]
for cf in cover_fields:
pattern = rf'{cf}[::]\s*\s*([^<]*)'
match = re.search(pattern, html)
if match:
value = match.group(1).strip()
if not value:
missing_fields.append(cf)
# 8. 检查更新产能模块
update_fields = ["频率分析", "发布时间分析"]
for uf in update_fields:
pattern = rf'{uf}[::]\s*\s*([^<]*)'
match = re.search(pattern, html)
if match:
value = match.group(1).strip()
if not value:
missing_fields.append(uf)
return html, missing_fields
def check_multi_html_content(html, accounts):
"""检查多账号HTML内容完整性
返回: (修复后的html, 缺失字段列表)
"""
missing_fields = []
# 检查每个账号的关键信息
for i, acc in enumerate(accounts):
header = acc.get("header", {})
scores = acc.get("scores", {})
name = header.get("账号名", "")
if not name:
missing_fields.append(f"账号{i+1}-账号名")
total_score = scores.get("综合评分", "")
if not total_score:
missing_fields.append(f"账号{i+1}-综合评分")
# 检查对比总结是否存在
if "核心差异" not in html or len(re.findall(r"核心差异", html)) == 0:
missing_fields.append("核心差异")
default_summary = '''
核心差异
各账号在定位、内容策略上存在差异,建议针对性优化。
'''
html = html.replace('