#!/usr/bin/env python3 """ 小红书热门笔记搜索脚本(支持 HTML 卡片布局输出) 基于红狐数据API,支持关键词搜索、分页、时间筛选 """ import sys import os import argparse import json import urllib.request import urllib.error def parse_count(value): """解析数量,支持 "17w+"、"1.5w" 格式""" if value is None: return 0 if isinstance(value, int): return value value_str = str(value).replace('+', '').replace(',', '').strip() # 处理 "w" 或 "W"(万) if 'w' in value_str.lower(): value_str = value_str.lower().replace('w', '') try: return int(float(value_str) * 10000) except: return 0 try: return int(float(value_str)) except: return 0 def fuzzy_count(value): """对5000+的互动数做模糊处理,5000以下保留原始数值""" if value is None: return '--' num = parse_count(value) if num <= 0: return '--' if num < 5000: return str(num) if num < 10000: return '5000+' # 1万以上:以万为单位,向下取整 wan = num // 10000 return f'{wan}w+' def get_api_key(): """从环境变量 REDFOX_API_KEY 获取 API Key,未配置时报错退出""" api_key = os.environ.get("REDFOX_API_KEY", "") if not api_key: print("❌ 未找到 API Key,请配置环境变量 REDFOX_API_KEY。", file=sys.stderr) print(" 示例:export REDFOX_API_KEY=your_api_key_here", file=sys.stderr) sys.exit(1) return api_key def fetch_xhs_hot_notes(keyword: str, debug: bool = False, max_retries: int = 3, start_date: str = None, end_date: str = None, page_num: int = 1, page_size: int = 50): """调用接口获取小红书热门笔记数据""" # API Key(从环境变量 REDFOX_API_KEY 读取,支持 shell 配置文件回退) api_key = get_api_key() # 构建请求 url = "https://redfox.hk/story/api/xhs/search/search" headers = { "Content-Type": "application/json", "X-API-KEY": api_key } payload = { "keyword": keyword, "pageNum": page_num, "pageSize": page_size, "startDate": start_date or "", "endDate": end_date or "", "source": "小红书笔记创作-ClawHub" } last_error = None for attempt in range(max_retries): try: if debug: print(f"\n=== DEBUG: 第 {attempt + 1} 次尝试 ===", file=sys.stderr) print(f"请求参数: {json.dumps(payload, ensure_ascii=False)}", file=sys.stderr) body = json.dumps(payload, ensure_ascii=False).encode("utf-8") req = urllib.request.Request(url, data=body, headers=headers, method="POST") with urllib.request.urlopen(req, timeout=30) as resp: status_code = resp.status resp_bytes = resp.read() resp_text = resp_bytes.decode("utf-8") if debug: print(f"状态码: {status_code}", file=sys.stderr) print(f"响应长度: {len(resp_text)} 字节", file=sys.stderr) if status_code >= 400: raise Exception(f"HTTP请求失败: 状态码 {status_code}, {resp_text[:200]}") data = json.loads(resp_text) # 检查返回码 if data.get("code") != 2000: raise Exception(f"API 错误: {data.get('msg', '未知错误')}") result_data = data.get("data", {}) if debug: print("=== DEBUG: API 返回的 data 字段键 ===", file=sys.stderr) print(json.dumps(list(result_data.keys()), ensure_ascii=False, indent=2), file=sys.stderr) print(f"总条数: {result_data.get('total', 0)}", file=sys.stderr) articles = result_data.get("articles", []) return { "keyword": result_data.get("keyword", keyword), "articles": articles, "total": len(articles), "pageNum": result_data.get("pageNum", page_num), "pageSize": result_data.get("pageSize", page_size), "hotTopics": result_data.get("hotTopics", []), "relatedSearches": result_data.get("relatedSearches", []), "latestHotArticles": result_data.get("latestHotArticles", []) } except urllib.error.URLError as e: last_error = f"请求失败: {str(e)}" if debug: print(f" 错误: {str(e)[:100]}", file=sys.stderr) import time if attempt < max_retries - 1: time.sleep(2 ** attempt) continue except Exception as e: last_error = str(e) if debug: print(f" 错误: {str(e)[:100]}", file=sys.stderr) import time if attempt < max_retries - 1: time.sleep(2 ** attempt) continue raise Exception(f"{last_error}(已尝试 {max_retries} 次)") def get_cover_urls(data, max_items=10): """提取所有封面图URL""" urls = [] articles = data.get("articles", [])[:max_items] for item in articles: cover_url = item.get('cover', '') note_id = item.get('id', '') title = (item.get('title', '') or item.get('desc', ''))[:30] if cover_url and note_id: urls.append({ 'title': title, 'note_id': note_id, 'cover_url': cover_url, 'link': item.get('shareInfoLink', f"https://www.xiaohongshu.com/explore/{note_id}") }) return urls def get_top_articles(data, max_items=10): """ 获取文章列表(按接口原始返回顺序,截取前 max_items 条) """ articles = data.get("articles", [])[:max_items] return articles def format_as_html(data: dict, max_items: int = 10, start_date: str = None): """ 格式化输出热门笔记数据(HTML 卡片布局) """ from datetime import datetime keyword = data.get("keyword", "") total = data.get("total", 0) is_full_site = not keyword or keyword.strip() == "" def process_title(item): """处理标题""" title = item.get('title', '') if not title or title.strip() == '': desc = item.get('desc', '') if desc: title = desc.replace('\n', ' ').replace('\r', ' ').strip()[:30] if len(desc) > 30: title = title + '...' if not title or title.strip() == '': title = '无标题' title = title.replace('<', '<').replace('>', '>').replace('"', '"') return title def format_time(item): """格式化发布时间""" create_time = item.get('createTime', '') if create_time: try: month = int(create_time[5:7]) day = int(create_time[8:10]) return f"{month}月{day}日" except: pass return '--' def generate_card(item, idx): """生成单个卡片 HTML""" note_id = item.get('id', '') author_id = item.get('authorId', '') author_name = item.get('authorNickname', '未知') fans = item.get('authorFans', 0) title = process_title(item) pub_time = format_time(item) interactive_count = fuzzy_count(item.get('interactiveCount', 0)) like_count = fuzzy_count(item.get('likedCount', 0)) collect_count = fuzzy_count(item.get('collectedCount', 0)) # 作品链接 note_link = item.get('shareInfoLink') or f"https://www.xiaohongshu.com/explore/{note_id}" # 作者主页链接 author_link = f"https://www.xiaohongshu.com/user/profile/{author_id}" if author_id else "#" relevance_score = item.get('relevanceScore', 0) popularity_score = item.get('popularityScore', 0) recency_score = item.get('recencyScore', 0) total_score = item.get('totalScore', 0) # 评分标签(全站热门时不展示) scores_html = '' if not is_full_site: scores_html = f'''
未查询到相关热门笔记,建议更换关键词重试。