Files
vps-management-bot/projects/news-bot/scorer.py

123 lines
3.7 KiB
Python
Raw Normal View History

2026-03-21 01:10:53 +08:00
"""
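Scoring and filtering module, based on keyword weights plus a rule engine.
Scores are computed purely from local rules; no external AI API is called.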
"""
import re
import logging
logger = logging.getLogger(__name__)
# High-weight keywords: each one found in a title adds its weight to the score.
HIGH_WEIGHT_KEYWORDS = {
    # AI / compute
    "英伟达": 3,
    "NVIDIA": 3,
    "AMD": 2,
    "算力": 2,
    "AI": 2,
    "光模块": 2,
    "存储": 1,
    "芯片": 2,
    "GPU": 2,
    "大模型": 2,
    "ChatGPT": 2,
    "OpenAI": 2,
    "Anthropic": 2,
    "DeepSeek": 2,
    # Chinese tech companies
    "华为": 2,
    "腾讯": 2,
    "阿里": 2,
    "字节": 2,
    "小米": 2,
    "百度": 1,
    "京东": 1,
    "美团": 1,
    "拼多多": 1,
    "比亚迪": 2,
    # Major events
    "收购": 2,
    "突破": 2,
    "泄露": 3,
    "内幕": 3,
    "传闻": 2,
    "重磅": 3,
    "突发": 3,
    "暴涨": 2,
    "暴跌": 2,
    "崩盘": 3,
    "熔断": 3,
    "制裁": 2,
    "禁令": 2,
    "封锁": 2,
    # Stock market
    "A股": 1,
    "港股": 1,
    "美股": 1,
    "涨停": 2,
    "跌停": 2,
    "IPO": 2,
    "上市": 1,
    "退市": 2,
    "回购": 1,
    # Macro economy
    "降息": 2,
    "加息": 2,
    "美联储": 2,
    "央行": 1,
    "GDP": 1,
    "CPI": 1,
    "非农": 2,
    "失业率": 1,
}
# Low-value patterns: each regex that matches a title lowers its score.
LOW_VALUE_PATTERNS = [
    r"广告",
    r"推广",
    r"优惠券",
    r"免费领",
    r"点击.*查看",
    r"扫码",
    r"关注.*公众号",
]
def score_news(news_item: dict, user_keywords: list = None) -> int:
    """
    Score a single news item on a 0-10 scale using local keyword rules.

    Scoring rules, all applied to the item's title:
    - start from a base score of 3;
    - add the weight of every HIGH_WEIGHT_KEYWORDS entry found in the title
      (case-insensitive substring match);
    - add 1 for every user-defined keyword found in the title
      (case-insensitive substring match);
    - add 2 when the item has a truthy "important" value;
    - subtract 3 for every LOW_VALUE_PATTERNS regex that matches the title;
    - subtract 2 when the title is shorter than 10 characters;
    - clamp the final value to the 0-10 range.

    Args:
        news_item: News record; the "title" and "important" keys are read.
        user_keywords: Optional extra keywords. Entries that are not strings
            or that are empty / whitespace-only are ignored.

    Returns:
        The clamped integer score, or 0 when the title is missing or empty.
    """
    title = news_item.get("title", "")
    if not title:
        return 0

    # Lowercase the title once instead of once per keyword comparison.
    title_lower = title.lower()
    score = 3  # base score

    # 1. Built-in high-weight keywords.
    # NOTE(review): plain substring matching means short ASCII keywords also
    # hit inside longer words (e.g. "AI" matches "said"); confirm whether
    # English-language titles need word-boundary matching.
    for keyword, weight in HIGH_WEIGHT_KEYWORDS.items():
        if keyword.lower() in title_lower:
            score += weight

    # 2. User-defined keywords.
    for kw in user_keywords or ():
        # An empty string is a substring of every title and would hand out a
        # free point; non-string entries have no .lower() and would crash.
        if not isinstance(kw, str) or not kw.strip():
            continue
        if kw.lower() in title_lower:
            score += 1

    # 3. Source flagged the item as important.
    if news_item.get("important"):
        score += 2

    # 4. Low-value content: every matching pattern costs 3 points.
    for pattern in LOW_VALUE_PATTERNS:
        if re.search(pattern, title):
            score -= 3

    # 5. Very short titles are likely to be invalid content.
    if len(title) < 10:
        score -= 2

    return max(0, min(10, score))
def is_similar(title1: str, title2: str, threshold: float = 0.7) -> bool:
    """
    Decide whether two titles are near-duplicates.

    Each title is stripped of every character that is not a regex word
    character, then split into overlapping two-character shingles. The
    titles count as similar when the Jaccard similarity of the two shingle
    sets (shared shingles divided by all distinct shingles) is at least
    ``threshold``.

    Returns False when either title is empty, or is empty after stripping.
    When either stripped title is too short to yield a shingle, the result
    is whether the stripped titles are exactly equal.
    """
    if not (title1 and title2):
        return False

    # Drop punctuation and whitespace before comparing.
    non_word = r"[^\w]"
    left = re.sub(non_word, "", title1)
    right = re.sub(non_word, "", title2)
    if not (left and right):
        return False

    # Overlapping 2-grams of each cleaned title.
    left_grams = {a + b for a, b in zip(left, left[1:])}
    right_grams = {a + b for a, b in zip(right, right[1:])}
    if not (left_grams and right_grams):
        # A one-character string has no 2-grams; fall back to equality.
        return left == right

    shared = len(left_grams & right_grams)
    total = len(left_grams | right_grams)
    return shared / total >= threshold
def dedup_news(news_list: list) -> list:
    """
    Remove near-duplicate news items based on title similarity.

    Items are visited in list order; an item is kept only when its "title"
    is not similar (per ``is_similar``) to the title of any item already
    kept, so the first item of each group of similar titles survives.
    Returns a new list; the input list is not modified.
    """
    unique = []
    for candidate in news_list:
        already_kept = any(
            is_similar(candidate.get("title", ""), kept.get("title", ""))
            for kept in unique
        )
        if not already_kept:
            unique.append(candidate)
    return unique
def score_and_filter(
    news_list: list, user_keywords: list = None, *, min_score: int = 0
) -> list:
    """
    Score every news item in place and optionally filter by score.

    Each item gets a "score" key holding the result of
    ``score_news(item, user_keywords)``; the items are mutated in place.

    Args:
        news_list: News records to score.
        user_keywords: Optional extra keywords forwarded to ``score_news``.
        min_score: Minimum score an item needs in order to be returned.
            With the default of 0 nothing is filtered out and the input
            list object itself is returned.

    Returns:
        ``news_list`` itself when ``min_score`` is 0 or lower; otherwise a
        new list with only the items whose score is at least ``min_score``,
        in their original order.
    """
    for item in news_list:
        item["score"] = score_news(item, user_keywords)

    if min_score <= 0:
        # Default path: hand back the same list object with every item.
        return news_list
    return [item for item in news_list if item["score"] >= min_score]