Files
vps-management-bot/projects/news-bot/scorer.py
2026-03-21 01:10:53 +08:00

123 lines
3.7 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
评分和过滤模块 - 基于关键词权重 + 规则引擎
不调用外部 AI API纯本地规则评分
"""
import re
import logging
logger = logging.getLogger(__name__)
# High-weight keywords: each keyword found in a title adds its weight to
# the score computed by score_news.
HIGH_WEIGHT_KEYWORDS = {
    # AI / compute
    "英伟达": 3,
    "NVIDIA": 3,
    "AMD": 2,
    "算力": 2,
    "AI": 2,
    "光模块": 2,
    "存储": 1,
    "芯片": 2,
    "GPU": 2,
    "大模型": 2,
    "ChatGPT": 2,
    "OpenAI": 2,
    "Anthropic": 2,
    "DeepSeek": 2,
    # Chinese technology companies
    "华为": 2,
    "腾讯": 2,
    "阿里": 2,
    "字节": 2,
    "小米": 2,
    "百度": 1,
    "京东": 1,
    "美团": 1,
    "拼多多": 1,
    "比亚迪": 2,
    # Major events
    "收购": 2,
    "突破": 2,
    "泄露": 3,
    "内幕": 3,
    "传闻": 2,
    "重磅": 3,
    "突发": 3,
    "暴涨": 2,
    "暴跌": 2,
    "崩盘": 3,
    "熔断": 3,
    "制裁": 2,
    "禁令": 2,
    "封锁": 2,
    # Stock market
    "A股": 1,
    "港股": 1,
    "美股": 1,
    "涨停": 2,
    "跌停": 2,
    "IPO": 2,
    "上市": 1,
    "退市": 2,
    "回购": 1,
    # Macro economy
    "降息": 2,
    "加息": 2,
    "美联储": 2,
    "央行": 1,
    "GDP": 1,
    "CPI": 1,
    "非农": 2,
    "失业率": 1,
}
# Low-value patterns: regular expressions searched in a title by
# score_news; every pattern that matches lowers the score.
LOW_VALUE_PATTERNS = [
    r"广告",
    r"推广",
    r"优惠券",
    r"免费领",
    r"点击.*查看",
    r"扫码",
    r"关注.*公众号",
]
def score_news(news_item: dict, user_keywords: list = None) -> int:
    """Score a single news item on a 0-10 scale using local rules only.

    Rules, all applied to the item's title:
      * start from a base score of 3;
      * add the weight of every HIGH_WEIGHT_KEYWORDS entry contained in
        the title (case-insensitive substring match);
      * add 1 for every distinct user keyword contained in the title
        (case-insensitive substring match);
      * add 2 when the item's "important" field is truthy;
      * subtract 3 for every LOW_VALUE_PATTERNS regex found in the title;
      * subtract 2 when the title is shorter than 10 characters;
      * clamp the final value to the range 0-10.

    Args:
        news_item: Mapping describing the news item. Only the "title" and
            "important" keys are read.
        user_keywords: Optional collection of extra keywords. Entries that
            are not strings, or that are empty / whitespace-only, are
            ignored, and keywords differing only in case count once.

    Returns:
        An integer between 0 and 10. Returns 0 when the title is missing,
        not a string, or blank.
    """
    title = news_item.get("title", "")
    # A missing, non-string or blank title carries nothing to score.
    if not isinstance(title, str) or not title.strip():
        return 0

    # Lower-case the title once instead of once per keyword.
    lowered_title = title.lower()
    score = 3  # base score

    # 1. Built-in high-weight keywords.
    # NOTE(review): this is a plain substring test, so short ASCII keywords
    # such as "AI" also match inside longer words (e.g. "said").
    score += sum(
        weight
        for keyword, weight in HIGH_WEIGHT_KEYWORDS.items()
        if keyword.lower() in lowered_title
    )

    # 2. User-defined keywords. An empty string is a substring of every
    # title, so blank entries must be dropped or they would always add 1.
    if user_keywords:
        wanted = {
            kw.lower()
            for kw in user_keywords
            if isinstance(kw, str) and kw.strip()
        }
        score += sum(1 for kw in wanted if kw in lowered_title)

    # 3. Item flagged as important by its source.
    if news_item.get("important"):
        score += 2

    # 4. Low-value content: every matching pattern costs 3 points.
    score -= 3 * sum(
        1 for pattern in LOW_VALUE_PATTERNS if re.search(pattern, title)
    )

    # 5. Very short titles are likely to be invalid content.
    if len(title) < 10:
        score -= 2

    return max(0, min(10, score))
def is_similar(title1: str, title2: str, threshold: float = 0.7) -> bool:
    """Return True when two titles look like near-duplicates.

    Each title is stripped of every non-word character (punctuation and
    whitespace), lower-cased, and turned into the set of its character
    bigrams. The titles are similar when the Jaccard index of the two
    sets (size of intersection divided by size of union) is at least
    ``threshold``.

    Args:
        title1: First title.
        title2: Second title.
        threshold: Minimum Jaccard index, compared with ``>=``.

    Returns:
        False when either title is empty or has no word characters.
        When a cleaned title is a single character (so it has no
        bigrams), the result is whether the cleaned titles are equal.
    """
    if not title1 or not title2:
        return False

    # Drop punctuation/whitespace and ignore case, so that
    # "NVIDIA ..." and "nvidia ..." compare as the same text.
    clean1 = re.sub(r"[^\w]", "", title1).lower()
    clean2 = re.sub(r"[^\w]", "", title2).lower()
    if not clean1 or not clean2:
        return False

    # Character 2-gram sets.
    bigrams1 = {clean1[i:i + 2] for i in range(len(clean1) - 1)}
    bigrams2 = {clean2[i:i + 2] for i in range(len(clean2) - 1)}
    if not bigrams1 or not bigrams2:
        # A one-character title has no bigrams; fall back to equality.
        return clean1 == clean2

    shared = bigrams1 & bigrams2
    combined = bigrams1 | bigrams2
    return len(shared) / len(combined) >= threshold
def dedup_news(news_list: list) -> list:
    """Remove near-duplicate news items, keeping the first of each group.

    Items are visited in list order; an item is kept only when its
    "title" is not similar (per is_similar) to the title of any item
    already kept. A new list is returned and the input is not modified.
    """
    kept = []
    for candidate in news_list:
        already_seen = any(
            is_similar(candidate.get("title", ""), earlier.get("title", ""))
            for earlier in kept
        )
        if already_seen:
            continue
        kept.append(candidate)
    return kept
def score_and_filter(
    news_list: list, user_keywords: list = None, min_score: int = 0
) -> list:
    """Score every news item in place and optionally filter by score.

    Each item receives a "score" key holding the result of
    score_news(item, user_keywords); the items are mutated in place.

    Args:
        news_list: List of news item dicts.
        user_keywords: Optional extra keywords forwarded to score_news.
        min_score: Minimum score an item needs to be returned. With the
            default of 0 nothing is filtered out.

    Returns:
        ``news_list`` itself when ``min_score`` is 0 or less; otherwise a
        new list with the items whose score is at least ``min_score``,
        in their original order.
    """
    for item in news_list:
        item["score"] = score_news(item, user_keywords)

    # Default: hand back the very same list object, unfiltered.
    if min_score <= 0:
        return news_list
    return [item for item in news_list if item["score"] >= min_score]