""" 跨平台餐厅匹配 — 基于原始 match_restaurants.py 保留: thefuzz 模糊匹配、多策略匹配、连锁店后缀处理、一致性评分 """ import math import re from typing import List, Dict from models import DianpingRestaurant, XiaohongshuPost, MatchedRestaurant # thefuzz 是可选依赖,没装时回退到简单匹配 try: from thefuzz import fuzz HAS_FUZZ = True except ImportError: HAS_FUZZ = False print("💡 提示: 安装 thefuzz 可获得更好的匹配效果: pip3 install thefuzz") # 连锁店常见后缀(来自原始 match_restaurants.py) CHAIN_SUFFIXES = re.compile( r'[((].{0,15}[))]|' r'(静安|徐汇|浦东|朝阳|海淀|南山|福田|天河|武侯|锦江|南油|华强|科技园)' r'(店|分店|旗舰店|总店)?$' ) def normalize_name(name: str) -> str: """标准化餐厅名:去除分店后缀、空格、特殊符号""" name = name.strip() name = CHAIN_SUFFIXES.sub('', name) name = re.sub(r'[\s·・\-—]+', '', name) return name def calculate_similarity(dp_name: str, xhs_name: str) -> float: """ 计算两个店名的相似度(0~1) 使用多策略匹配(来自原始 match_restaurants.py._calculate_similarity) """ dp_norm = normalize_name(dp_name) xhs_norm = normalize_name(xhs_name) if not dp_norm or not xhs_norm: return 0.0 # 策略1: 完全匹配 if dp_norm == xhs_norm: return 1.0 if HAS_FUZZ: # 策略2: 精确比率 exact_score = fuzz.ratio(dp_norm, xhs_norm) / 100 # 策略3: 部分匹配 partial_score = fuzz.partial_ratio(dp_norm, xhs_norm) / 100 # 策略4: Token 排序 token_score = fuzz.token_sort_ratio(dp_norm, xhs_norm) / 100 else: # 简单 Jaccard 相似度作为回退 s1, s2 = set(dp_norm), set(xhs_norm) exact_score = len(s1 & s2) / len(s1 | s2) if (s1 | s2) else 0 partial_score = 0 token_score = 0 # 策略5: 包含关系 containment_score = 0.0 if dp_norm in xhs_norm or xhs_norm in dp_norm: shorter = min(len(dp_norm), len(xhs_norm)) longer = max(len(dp_norm), len(xhs_norm)) containment_score = shorter / longer if longer > 0 else 0.0 # 取最优策略 return max( exact_score, partial_score * 0.90, token_score * 0.85, containment_score * 0.88, ) def normalize_engagement(xhs_post: XiaohongshuPost) -> float: """ 将小红书互动量归一化到 0-5 评分(来自原始 match_restaurants.py) 使用对数归一化避免极端值影响 """ engagement = ( xhs_post.likes * 1.0 + xhs_post.saves * 2.0 + xhs_post.comments * 1.5 ) if engagement <= 0: return 0.0 # log1p(5000) ≈ 8.52 作为"满分"参考点 normalized = math.log1p(engagement) / math.log1p(5000) * 5 return max(0.0, min(5.0, normalized)) def calculate_consistency( dp_rating: float, xhs_engagement_normalized: float, xhs_sentiment: float, ) -> float: """ 计算两平台一致性评分 0~1(来自原始 match_restaurants.py) """ dp_rating = max(0.0, min(5.0, dp_rating)) xhs_engagement_normalized = max(0.0, min(5.0, xhs_engagement_normalized)) xhs_sentiment = max(-1.0, min(1.0, xhs_sentiment)) # 评分相关性 rating_diff = abs(dp_rating - xhs_engagement_normalized) rating_correlation = max(0.0, 1.0 - (rating_diff / 2.5)) # 情感一致性 sentiment_normalized = (xhs_sentiment + 1) / 2 # -1~1 → 0~1 sentiment_alignment = sentiment_normalized return max(0.0, min(1.0, rating_correlation * 0.6 + sentiment_alignment * 0.4)) def match_and_score( dp_restaurants: List[DianpingRestaurant], xhs_posts: List[XiaohongshuPost], similarity_threshold: float = 0.55, ) -> List[MatchedRestaurant]: """ 跨平台匹配并计算一致性评分 Args: dp_restaurants: 大众点评数据 xhs_posts: 小红书数据 similarity_threshold: 匹配阈值 Returns: 匹配结果列表,按一致性排序 """ matches = [] used_xhs = set() for dp in dp_restaurants: best_idx, best_score = None, 0 for idx, xhs in enumerate(xhs_posts): if idx in used_xhs: continue score = calculate_similarity(dp.name, xhs.restaurant_name) if score > best_score and score >= similarity_threshold: best_score = score best_idx = idx if best_idx is not None: xhs = xhs_posts[best_idx] used_xhs.add(best_idx) # 计算一致性 xhs_engagement_norm = normalize_engagement(xhs) consistency = calculate_consistency( dp.rating, xhs_engagement_norm, xhs.sentiment_score ) matches.append(MatchedRestaurant( name=dp.name, dianping_data=dp, xhs_data=xhs, similarity_score=best_score, consistency_score=consistency, )) return matches