#!/usr/bin/env python3
"""
Xiaohongshu (RED) hot-note search script with HTML card-layout output.

Backed by the RedFox data API; supports keyword search, pagination and
date-range filtering. Structured JSON is printed to stdout and an HTML
report is written to a file.
"""

import argparse
import html
import json
import os
import re
import sys
import time
import urllib.error
import urllib.request
from typing import Optional

_NOTE_URL_PREFIX = "https://www.xiaohongshu.com/explore/"
_PROFILE_URL_PREFIX = "https://www.xiaohongshu.com/user/profile/"

# Characters that are path separators or illegal in file names on common
# platforms; they are replaced when a file name is derived from the keyword.
_UNSAFE_FILENAME_CHARS = re.compile(r'[\\/:*?<>|\x00-\x1f]')

# Kept outside the f-string templates so CSS braces need no escaping.
_PAGE_STYLE = """
body { margin: 0; padding: 24px; background: #f5f5f7; color: #222;
       font-family: -apple-system, "PingFang SC", "Microsoft YaHei", sans-serif; }
.container { max-width: 860px; margin: 0 auto; }
h1 { font-size: 24px; }
h2 { font-size: 18px; margin-top: 32px; }
.summary, .footer { color: #666; font-size: 14px; }
.card, .empty-hint { background: #fff; border-radius: 10px; padding: 16px;
                     margin: 12px 0; box-shadow: 0 1px 3px rgba(0, 0, 0, .08); }
.card-title { font-size: 16px; font-weight: 600; margin-bottom: 6px; }
.card-meta, .card-scores, .card-stats { font-size: 13px; color: #555; margin-top: 4px; }
.card-scores span { margin-right: 12px; }
a { color: #ff2442; text-decoration: none; }
"""


def parse_count(value):
    """Parse an interaction count into an int.

    Accepts ints, numeric strings, and abbreviated forms that use a unit of
    ten thousand such as "17w+", "1.5w" or "1.5万". Any '+' and ',' characters
    are ignored.

    Returns:
        The parsed integer, or 0 when ``value`` is None or cannot be parsed.
    """
    if value is None:
        return 0
    if isinstance(value, int):
        return int(value)  # also normalizes bool to a plain int

    text = str(value).replace('+', '').replace(',', '').strip().lower()
    multiplier = 1
    if 'w' in text or '万' in text:
        text = text.replace('w', '').replace('万', '')
        multiplier = 10000
    try:
        return int(float(text) * multiplier)
    except (ValueError, OverflowError):
        # ValueError: not a number (or NaN); OverflowError: infinity.
        return 0


def fuzzy_count(value):
    """Return a display string for a count, blurring values of 5000 and above.

    Returns:
        '--' for None / non-positive / unparseable values, the exact number
        below 5000, '5000+' for 5000..9999, and '<N>w+' (N = count // 10000)
        from 10000 upwards.
    """
    if value is None:
        return '--'
    num = parse_count(value)
    if num <= 0:
        return '--'
    if num < 5000:
        return str(num)
    if num < 10000:
        return '5000+'
    # 10,000 and above: express in units of ten thousand, rounded down.
    return f'{num // 10000}w+'


def get_api_key():
    """Return the API key from the REDFOX_API_KEY environment variable.

    Prints a hint to stderr and exits with status 1 when the variable is
    unset or blank.
    """
    api_key = os.environ.get("REDFOX_API_KEY", "").strip()
    if not api_key:
        print("❌ 未找到 API Key,请配置环境变量 REDFOX_API_KEY。", file=sys.stderr)
        print(" 示例:export REDFOX_API_KEY=your_api_key_here", file=sys.stderr)
        sys.exit(1)
    return api_key


def fetch_xhs_hot_notes(keyword: str, debug: bool = False, max_retries: int = 3,
                        start_date: Optional[str] = None,
                        end_date: Optional[str] = None,
                        page_num: int = 1, page_size: int = 50):
    """Call the search endpoint and return the hot-note data.

    Args:
        keyword: Search keyword sent to the API.
        debug: When true, print request/response diagnostics to stderr.
        max_retries: Number of attempts; values below 1 are treated as 1.
        start_date: Sent as ``startDate`` (empty string when None).
        end_date: Sent as ``endDate`` (empty string when None).
        page_num: Sent as ``pageNum``.
        page_size: Sent as ``pageSize``.

    Returns:
        A dict with keys ``keyword``, ``articles``, ``total`` (the number of
        articles in this response), ``pageNum``, ``pageSize``, ``hotTopics``,
        ``relatedSearches`` and ``latestHotArticles``.

    Raises:
        RuntimeError: When every attempt failed; the message carries the last
            error and the number of attempts made.
    """
    # The API key is read from the REDFOX_API_KEY environment variable only.
    api_key = get_api_key()

    url = "https://redfox.hk/story/api/xhs/search/search"
    headers = {
        "Content-Type": "application/json",
        "X-API-KEY": api_key
    }
    payload = {
        "keyword": keyword,
        "pageNum": page_num,
        "pageSize": page_size,
        "startDate": start_date or "",
        "endDate": end_date or "",
        "source": "小红书笔记创作-ClawHub"
    }
    body = json.dumps(payload, ensure_ascii=False).encode("utf-8")

    # Always try at least once so a failure reports a real error.
    attempts = max(1, max_retries)
    last_error = None
    last_exc = None

    for attempt in range(attempts):
        try:
            if debug:
                print(f"\n=== DEBUG: 第 {attempt + 1} 次尝试 ===", file=sys.stderr)
                print(f"请求参数: {json.dumps(payload, ensure_ascii=False)}", file=sys.stderr)

            req = urllib.request.Request(url, data=body, headers=headers, method="POST")
            with urllib.request.urlopen(req, timeout=30) as resp:
                status_code = resp.status
                resp_text = resp.read().decode("utf-8")

            if debug:
                print(f"状态码: {status_code}", file=sys.stderr)
                print(f"响应长度: {len(resp_text)} 字节", file=sys.stderr)

            if status_code >= 400:
                raise Exception(f"HTTP请求失败: 状态码 {status_code}, {resp_text[:200]}")

            data = json.loads(resp_text)
            if not isinstance(data, dict):
                raise ValueError("API 返回格式异常")

            # Check the application-level return code.
            if data.get("code") != 2000:
                raise Exception(f"API 错误: {data.get('msg', '未知错误')}")

            # "data" / "articles" may be present but null; treat as empty.
            result_data = data.get("data") or {}

            if debug:
                print("=== DEBUG: API 返回的 data 字段键 ===", file=sys.stderr)
                print(json.dumps(list(result_data.keys()), ensure_ascii=False, indent=2), file=sys.stderr)
                print(f"总条数: {result_data.get('total', 0)}", file=sys.stderr)

            articles = result_data.get("articles") or []

            return {
                "keyword": result_data.get("keyword", keyword),
                "articles": articles,
                "total": len(articles),
                "pageNum": result_data.get("pageNum", page_num),
                "pageSize": result_data.get("pageSize", page_size),
                "hotTopics": result_data.get("hotTopics", []),
                "relatedSearches": result_data.get("relatedSearches", []),
                "latestHotArticles": result_data.get("latestHotArticles", [])
            }
        except Exception as e:  # retry boundary: every failure is retried
            last_exc = e
            if isinstance(e, urllib.error.URLError):
                last_error = f"请求失败: {str(e)}"
            else:
                last_error = str(e)
            if debug:
                print(f" 错误: {str(e)[:100]}", file=sys.stderr)
            if attempt < attempts - 1:
                time.sleep(2 ** attempt)  # exponential backoff: 1s, 2s, 4s, ...

    raise RuntimeError(f"{last_error}(已尝试 {attempts} 次)") from last_exc


def get_cover_urls(data, max_items=10):
    """Collect cover-image info for the first ``max_items`` articles.

    Articles lacking a cover URL or an id are skipped.

    Returns:
        A list of dicts with keys ``title`` (at most 30 characters, taken
        from the title or else the desc), ``note_id``, ``cover_url`` and
        ``link``.
    """
    urls = []
    for item in (data.get("articles") or [])[:max_items]:
        cover_url = item.get('cover') or ''
        note_id = item.get('id') or ''
        if not (cover_url and note_id):
            continue
        urls.append({
            'title': (item.get('title') or item.get('desc') or '')[:30],
            'note_id': note_id,
            'cover_url': cover_url,
            'link': item.get('shareInfoLink') or f"{_NOTE_URL_PREFIX}{note_id}"
        })
    return urls


def get_top_articles(data, max_items=10):
    """Return the first ``max_items`` articles in the API's original order."""
    return (data.get("articles") or [])[:max_items]


def _safe_link(candidate, fallback):
    """Return ``candidate`` if it is an http(s) URL, otherwise ``fallback``."""
    if isinstance(candidate, str) and candidate.lower().startswith(("http://", "https://")):
        return candidate
    return fallback


def _display_title(item):
    """Return the HTML-escaped card title, falling back to a desc excerpt."""
    title = str(item.get('title') or '')
    if not title.strip():
        desc = str(item.get('desc') or '')
        flattened = desc.replace('\n', ' ').replace('\r', ' ').strip()
        title = flattened[:30]
        if len(flattened) > 30:
            title += '...'
    if not title.strip():
        title = '无标题'
    return html.escape(title)


def _format_publish_date(item):
    """Format ``createTime`` as '<month>月<day>日', or '--' when unavailable."""
    create_time = item.get('createTime') or ''
    try:
        # Reads the month and day by character position (offsets 5-6 and 8-9).
        month = int(create_time[5:7])
        day = int(create_time[8:10])
    except (TypeError, ValueError):
        return '--'
    return f"{month}月{day}日"


def _render_card(item, idx, show_scores):
    """Render one note as an HTML card; all API-supplied text is escaped."""
    note_id = item.get('id') or ''
    author_id = item.get('authorId') or ''
    author_name = html.escape(str(item.get('authorNickname') or '未知'))
    fans = fuzzy_count(item.get('authorFans', 0))
    title = _display_title(item)
    pub_time = _format_publish_date(item)
    interactive_count = fuzzy_count(item.get('interactiveCount', 0))
    like_count = fuzzy_count(item.get('likedCount', 0))
    collect_count = fuzzy_count(item.get('collectedCount', 0))

    # Note link: only http(s) share links are trusted, else the canonical URL.
    note_link = html.escape(
        _safe_link(item.get('shareInfoLink'), f"{_NOTE_URL_PREFIX}{note_id}")
    )

    # Author profile link; rendered as plain text when no author id is known.
    if author_id:
        author_link = html.escape(f"{_PROFILE_URL_PREFIX}{author_id}")
        author_html = f'<a href="{author_link}" target="_blank" rel="noopener">{author_name}</a>'
    else:
        author_html = author_name

    # Score badges are hidden for site-wide (keyword-less) results.
    scores_html = ''
    if show_scores:
        relevance = html.escape(str(item.get('relevanceScore', 0)))
        popularity = html.escape(str(item.get('popularityScore', 0)))
        recency = html.escape(str(item.get('recencyScore', 0)))
        scores_html = (
            '<div class="card-scores">'
            f'<span>相关性 {relevance}</span> '
            f'<span>热度 {popularity}</span> '
            f'<span>时效 {recency}</span>'
            '</div>'
        )

    return f'''
<div class="card">
  <div class="card-title">{idx + 1}. {title}</div>
  <div class="card-meta">{author_html}({fans}粉) · 发布日期:{pub_time}</div>
  {scores_html}
  <div class="card-stats">🔥 {interactive_count}互动 👍{like_count} ⭐{collect_count} <a href="{note_link}" target="_blank" rel="noopener">查看作品 ↗</a></div>
</div>
'''


def format_as_html(data: dict, max_items: int = 10,
                   start_date: Optional[str] = None,
                   end_date: Optional[str] = None):
    """Render the hot-note data as a standalone HTML report (card layout).

    Args:
        data: Result dict as returned by ``fetch_xhs_hot_notes``.
        max_items: Maximum number of main-list cards.
        start_date: Start of the queried range, used only for the label.
        end_date: End of the queried range, used only for the label.

    Returns:
        The complete HTML document as a string. Text coming from ``data`` is
        HTML-escaped before being embedded.
    """
    keyword = data.get("keyword") or ""
    total = data.get("total", 0)
    is_full_site = not keyword.strip()

    # Main list (API order) and the recommended list (first 10 entries).
    top_items = get_top_articles(data, max_items)
    latest_hot_items = (data.get("latestHotArticles") or [])[:10]

    # Hint shown when the main list is empty.
    no_articles_hint = ''
    if not top_items:
        no_articles_hint = '''
<div class="empty-hint">
未查询到相关热门笔记,建议更换关键词重试。
</div>
'''

    cards_html = ''.join(
        _render_card(item, idx, not is_full_site)
        for idx, item in enumerate(top_items)
    )

    # Recommended section, rendered whenever latestHotArticles is non-empty.
    latest_hot_html = ''
    if latest_hot_items:
        latest_cards = ''.join(
            _render_card(item, idx, not is_full_site)
            for idx, item in enumerate(latest_hot_items)
        )
        latest_hot_html = f'''
<h2>近期热门笔记推荐</h2>
{latest_cards}
'''

    if start_date and end_date:
        time_range = f"{start_date} 至 {end_date}"
    elif start_date:
        time_range = f"从{start_date}起"
    elif end_date:
        time_range = f"截至{end_date}"
    else:
        time_range = "近30天"

    return f'''<!DOCTYPE html>
<html lang="zh-CN">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>小红书热门笔记数据分析报告</title>
<style>{_PAGE_STYLE}</style>
</head>
<body>
<div class="container">
<h1>小红书热门笔记数据分析报告</h1>
<div class="summary">
<p>关键词:{html.escape(keyword)} | 时间范围:{html.escape(time_range)}</p>
<p>共找到 {html.escape(str(total))} 条相关笔记</p>
</div>
{no_articles_hint}
{cards_html}
{latest_hot_html}
<div class="footer">
<p>数据来源:小红书热门笔记搜索,每日更新最新热门内容</p>
<p>备注:互动数据为入库快照,实时数据可能持续增长</p>
</div>
</div>
</body>
</html>
'''


def _short_title(item):
    """Return the title, or the first 50 characters of desc when it is empty."""
    return item.get('title') or (item.get('desc') or '')[:50]


def _note_link(item):
    """Return the share link, or the canonical note URL built from the id."""
    return item.get('shareInfoLink') or f"{_NOTE_URL_PREFIX}{item.get('id', '')}"


def _author_link(item):
    """Return the author's profile URL, or '' when no author id is present."""
    author_id = item.get('authorId')
    return f"{_PROFILE_URL_PREFIX}{author_id}" if author_id else ''


def format_as_json(data: dict, max_items: int = 10):
    """Build the JSON-serializable summary (consumed by the calling agent).

    Args:
        data: Result dict as returned by ``fetch_xhs_hot_notes``.
        max_items: Maximum number of entries in ``items``.

    Returns:
        A dict with keys ``keyword``, ``total``, ``pageNum``, ``pageSize``,
        ``isFullSite``, ``items``, ``latestHotArticles`` (at most 10 entries,
        without score fields) and ``relatedSearches``. Count fields are
        passed through ``fuzzy_count``.
    """
    top_items = get_top_articles(data, max_items)
    keyword = data.get('keyword') or ''
    is_full_site = not keyword.strip()
    latest_hot_items = (data.get("latestHotArticles") or [])[:10]

    result = []
    for item in top_items:
        item_data = {
            'noteId': item.get('id', ''),
            'title': _short_title(item),
            'desc': item.get('desc', ''),
            'authorId': item.get('authorId', ''),
            'authorNickname': item.get('authorNickname', ''),
            'authorFans': fuzzy_count(item.get('authorFans', 0)),
            'createTime': item.get('createTime', ''),
            'noteLink': _note_link(item),
            'authorLink': _author_link(item),
            'interactiveCount': fuzzy_count(item.get('interactiveCount', 0)),
            'likedCount': fuzzy_count(item.get('likedCount', 0)),
            'collectedCount': fuzzy_count(item.get('collectedCount', 0)),
            'commentsCount': fuzzy_count(item.get('commentsCount', 0)),
            'sharedCount': fuzzy_count(item.get('sharedCount', 0)),
        }
        # Score fields are only emitted for keyword searches.
        if not is_full_site:
            item_data['totalScore'] = item.get('totalScore', 0)
            item_data['relevanceScore'] = item.get('relevanceScore', 0)
            item_data['popularityScore'] = item.get('popularityScore', 0)
            item_data['recencyScore'] = item.get('recencyScore', 0)
        result.append(item_data)

    # Recommended hot notes (latestHotArticles) carry no score fields.
    latest_hot_result = [
        {
            'noteId': item.get('id', ''),
            'title': _short_title(item),
            'authorNickname': item.get('authorNickname', ''),
            'authorFans': fuzzy_count(item.get('authorFans', 0)),
            'createTime': item.get('createTime', ''),
            'noteLink': _note_link(item),
            'authorLink': _author_link(item),
            'interactiveCount': fuzzy_count(item.get('interactiveCount', 0)),
            'likedCount': fuzzy_count(item.get('likedCount', 0)),
            'collectedCount': fuzzy_count(item.get('collectedCount', 0)),
        }
        for item in latest_hot_items
    ]

    return {
        'keyword': data.get('keyword', ''),
        'total': data.get('total', 0),
        'pageNum': data.get('pageNum', 1),
        'pageSize': data.get('pageSize', 50),
        'isFullSite': is_full_site,
        'items': result,
        'latestHotArticles': latest_hot_result,
        'relatedSearches': data.get('relatedSearches', [])
    }


def _safe_filename_stem(keyword):
    """Derive a file-name stem from the keyword.

    Double quotes are dropped, spaces become '_', and path separators or
    other characters that are illegal in file names are replaced with '_'
    so the default output always lands in the current directory.
    """
    stem = keyword.replace('"', '').replace(' ', '_')
    return _UNSAFE_FILENAME_CHARS.sub('_', stem) or '全站热门'


def main():
    """CLI entry point: fetch data, print JSON to stdout, write the HTML file.

    Progress and statistics go to stderr so stdout stays machine-readable.
    Exits with status 1 on any error.
    """
    parser = argparse.ArgumentParser(description='小红书热门笔记搜索工具')
    parser.add_argument('--keyword', required=True, help='搜索关键词')
    parser.add_argument('--max-items', type=int, default=10, help='最多展示数量(默认10条)')
    # NOTE(review): --output-format is parsed but never read; JSON is always
    # printed and the HTML file is always written. Confirm whether the flag
    # should gate either output before changing behavior.
    parser.add_argument('--output-format', choices=['json', 'html'], default='json',
                        help='输出格式(默认json输出到stdout,html输出到文件)')
    parser.add_argument('--output-file', type=str, default=None,
                        help='输出文件路径(默认:关键词_热门数据.html)')
    parser.add_argument('--start-date', type=str, default=None, help='开始日期,格式 yyyy-MM-dd')
    parser.add_argument('--end-date', type=str, default=None, help='结束日期,格式 yyyy-MM-dd')
    parser.add_argument('--page-num', type=int, default=1, help='页码(默认1)')
    parser.add_argument('--page-size', type=int, default=50, help='每页条数(默认50)')
    parser.add_argument('--debug', action='store_true', help='启用调试模式')
    parser.add_argument('--max-retries', type=int, default=3, help='最大重试次数(默认3次)')

    args = parser.parse_args()

    try:
        data = fetch_xhs_hot_notes(
            keyword=args.keyword,
            debug=args.debug,
            max_retries=args.max_retries,
            start_date=args.start_date,
            end_date=args.end_date,
            page_num=args.page_num,
            page_size=args.page_size
        )

        # Structured JSON always goes to stdout for the calling agent.
        json_data = format_as_json(data, max_items=args.max_items)
        print(json.dumps(json_data, ensure_ascii=False, indent=2))

        # The HTML report is written alongside.
        html_content = format_as_html(data, max_items=args.max_items,
                                      start_date=args.start_date,
                                      end_date=args.end_date)
        html_file = args.output_file or f"{_safe_filename_stem(args.keyword)}_热门数据.html"
        with open(html_file, 'w', encoding='utf-8') as f:
            f.write(html_content)

        # Statistics go to stderr.
        print(f"✓ HTML 结果已保存到: {html_file}", file=sys.stderr)
        print(f"✓ 关键词: {args.keyword}", file=sys.stderr)
        print(f"✓ 总条数: {json_data['total']} 条", file=sys.stderr)
        print(f"✓ 筛选结果: {len(json_data['items'])} 条", file=sys.stderr)
        print(f"✓ 推荐热门笔记: {len(json_data.get('latestHotArticles', []))} 条", file=sys.stderr)

        # Cover-image URLs for follow-up style analysis.
        cover_urls = get_cover_urls(data, max_items=5)
        if cover_urls:
            print("\n=== 封面图URL(用于风格分析)===", file=sys.stderr)
            for i, item in enumerate(cover_urls, 1):
                print(f"{i}. {item['title']}: {item['cover_url']}", file=sys.stderr)

    except Exception as e:  # top-level boundary: report and exit non-zero
        print(f"❌ 错误: {str(e)}", file=sys.stderr)
        sys.exit(1)


if __name__ == "__main__":
    main()