""" 评分和过滤模块 - 基于关键词权重 + 规则引擎 不调用外部 AI API,纯本地规则评分 """ import re import logging logger = logging.getLogger(__name__) # 高权重关键词(出现即加分) HIGH_WEIGHT_KEYWORDS = { # AI / 算力相关 "英伟达": 3, "NVIDIA": 3, "AMD": 2, "算力": 2, "AI": 2, "光模块": 2, "存储": 1, "芯片": 2, "GPU": 2, "大模型": 2, "ChatGPT": 2, "OpenAI": 2, "Anthropic": 2, "DeepSeek": 2, # 中国科技公司 "华为": 2, "腾讯": 2, "阿里": 2, "字节": 2, "小米": 2, "百度": 1, "京东": 1, "美团": 1, "拼多多": 1, "比亚迪": 2, # 重磅事件 "收购": 2, "突破": 2, "泄露": 3, "内幕": 3, "传闻": 2, "重磅": 3, "突发": 3, "暴涨": 2, "暴跌": 2, "崩盘": 3, "熔断": 3, "制裁": 2, "禁令": 2, "封锁": 2, # 股市相关 "A股": 1, "港股": 1, "美股": 1, "涨停": 2, "跌停": 2, "IPO": 2, "上市": 1, "退市": 2, "回购": 1, # 宏观 "降息": 2, "加息": 2, "美联储": 2, "央行": 1, "GDP": 1, "CPI": 1, "非农": 2, "失业率": 1, } # 低价值关键词(出现则减分) LOW_VALUE_PATTERNS = [ r"广告", r"推广", r"优惠券", r"免费领", r"点击.*查看", r"扫码", r"关注.*公众号", ] def score_news(news_item: dict, user_keywords: list = None) -> int: """ 对单条新闻评分,返回 0-10 分 评分规则: - 基础分 3 分 - 匹配高权重关键词加分 - 匹配用户自定义关键词加分 - 信息源标记为重要加分 - 匹配低价值模式减分 - 最终分数限制在 0-10 """ title = news_item.get("title", "") if not title: return 0 score = 3 # 基础分 # 1. 高权重关键词匹配 for keyword, weight in HIGH_WEIGHT_KEYWORDS.items(): if keyword.lower() in title.lower(): score += weight # 2. 用户自定义关键词匹配 if user_keywords: for kw in user_keywords: if kw.lower() in title.lower(): score += 1 # 3. 信息源标记为重要 if news_item.get("important"): score += 2 # 4. 低价值内容减分 for pattern in LOW_VALUE_PATTERNS: if re.search(pattern, title): score -= 3 # 5. 标题太短可能是无效内容 if len(title) < 10: score -= 2 return max(0, min(10, score)) def is_similar(title1: str, title2: str, threshold: float = 0.7) -> bool: """ 简单的标题相似度判断 使用字符级别的 Jaccard 相似度 """ if not title1 or not title2: return False # 去除标点和空格 clean1 = re.sub(r"[^\w]", "", title1) clean2 = re.sub(r"[^\w]", "", title2) if not clean1 or not clean2: return False # 2-gram 集合 set1 = set(clean1[i:i+2] for i in range(len(clean1)-1)) set2 = set(clean2[i:i+2] for i in range(len(clean2)-1)) if not set1 or not set2: return clean1 == clean2 intersection = set1 & set2 union = set1 | set2 return len(intersection) / len(union) >= threshold def dedup_news(news_list: list) -> list: """去重:基于标题相似度,保留最早的一条""" result = [] for item in news_list: is_dup = False for existing in result: if is_similar(item.get("title", ""), existing.get("title", "")): is_dup = True break if not is_dup: result.append(item) return result def score_and_filter(news_list: list, user_keywords: list = None) -> list: """批量评分并过滤,返回带评分的新闻列表""" for item in news_list: item["score"] = score_news(item, user_keywords) return news_list