123 lines
3.7 KiB
Python
123 lines
3.7 KiB
Python
"""
Scoring and filtering module - keyword weights plus a rule engine.

No external AI API is called; scoring uses purely local rules.
"""
|
||
import re
|
||
import logging
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# High-weight keywords: each one found in a title adds its weight to the score.
HIGH_WEIGHT_KEYWORDS = {
    # AI / compute
    "英伟达": 3,
    "NVIDIA": 3,
    "AMD": 2,
    "算力": 2,
    "AI": 2,
    "光模块": 2,
    "存储": 1,
    "芯片": 2,
    "GPU": 2,
    "大模型": 2,
    "ChatGPT": 2,
    "OpenAI": 2,
    "Anthropic": 2,
    "DeepSeek": 2,
    # Chinese technology companies
    "华为": 2,
    "腾讯": 2,
    "阿里": 2,
    "字节": 2,
    "小米": 2,
    "百度": 1,
    "京东": 1,
    "美团": 1,
    "拼多多": 1,
    "比亚迪": 2,
    # Major events
    "收购": 2,
    "突破": 2,
    "泄露": 3,
    "内幕": 3,
    "传闻": 2,
    "重磅": 3,
    "突发": 3,
    "暴涨": 2,
    "暴跌": 2,
    "崩盘": 3,
    "熔断": 3,
    "制裁": 2,
    "禁令": 2,
    "封锁": 2,
    # Stock market
    "A股": 1,
    "港股": 1,
    "美股": 1,
    "涨停": 2,
    "跌停": 2,
    "IPO": 2,
    "上市": 1,
    "退市": 2,
    "回购": 1,
    # Macro economy
    "降息": 2,
    "加息": 2,
    "美联储": 2,
    "央行": 1,
    "GDP": 1,
    "CPI": 1,
    "非农": 2,
    "失业率": 1,
}
|
||
|
||
# Low-value patterns: each regular expression that matches a title lowers
# its score.
LOW_VALUE_PATTERNS = [
    r"广告",
    r"推广",
    r"优惠券",
    r"免费领",
    r"点击.*查看",
    r"扫码",
    r"关注.*公众号",
]
|
||
|
||
|
||
def score_news(news_item: dict, user_keywords: list = None) -> int:
    """
    Score a single news item and return an integer in the range 0-10.

    Scoring rules, applied in order:
      - start from a base score of 3;
      - add the weight of every HIGH_WEIGHT_KEYWORDS entry found in the
        title (case-insensitive substring match);
      - add 1 for every non-empty user keyword found in the title
        (case-insensitive substring match);
      - add 2 when the item has a truthy ``"important"`` value;
      - subtract 3 for every LOW_VALUE_PATTERNS regex that matches the title;
      - subtract 2 when the title is shorter than 10 characters;
      - clamp the result to 0-10.

    Matching is by plain substring, so a short keyword such as "AI" also
    matches inside longer Latin words that contain those letters.

    Args:
        news_item: Mapping describing the news item; only the ``"title"``
            and ``"important"`` keys are read.
        user_keywords: Optional extra keywords. Empty or ``None`` entries
            are ignored.

    Returns:
        The clamped score, or 0 when the item has no title.
    """
    title = news_item.get("title", "")
    if not title:
        return 0

    # Lower-case the title once instead of once per keyword comparison.
    lowered_title = title.lower()

    score = 3  # base score

    # 1. Built-in high-weight keywords.
    for keyword, weight in HIGH_WEIGHT_KEYWORDS.items():
        if keyword.lower() in lowered_title:
            score += weight

    # 2. User-defined keywords. An empty string is a substring of every
    #    string, so blank entries must be skipped or they would add a point
    #    to every title; skipping falsy entries also tolerates None.
    for kw in user_keywords or ():
        if kw and kw.lower() in lowered_title:
            score += 1

    # 3. The source flagged this item as important.
    if news_item.get("important"):
        score += 2

    # 4. Penalise every low-value (advertising-like) pattern that matches.
    for pattern in LOW_VALUE_PATTERNS:
        if re.search(pattern, title):
            score -= 3

    # 5. Very short titles are likely to be invalid content.
    if len(title) < 10:
        score -= 2

    return max(0, min(10, score))
|
||
|
||
|
||
def is_similar(title1: str, title2: str, threshold: float = 0.7) -> bool:
    """
    Decide whether two titles are near-duplicates.

    Both titles are stripped of every non-word character (punctuation and
    whitespace) and case-folded, then compared by the Jaccard similarity of
    their character-bigram (2-gram) sets.

    Args:
        title1: First title.
        title2: Second title.
        threshold: Minimum Jaccard similarity for the titles to count as
            similar.

    Returns:
        True when the similarity is at least ``threshold``. False when either
        title is empty or contains no word characters. When a cleaned title
        is a single character (and so has no bigrams), the result is whether
        the two cleaned titles are equal.
    """
    if not title1 or not title2:
        return False

    # Drop punctuation/whitespace, then case-fold so that titles differing
    # only in letter case ("NVIDIA" vs "nvidia") are treated as the same
    # text, in line with the case-insensitive keyword matching in score_news.
    norm1 = re.sub(r"\W", "", title1).casefold()
    norm2 = re.sub(r"\W", "", title2).casefold()
    if not norm1 or not norm2:
        return False

    # Sets of overlapping 2-character slices.
    bigrams1 = {norm1[i:i + 2] for i in range(len(norm1) - 1)}
    bigrams2 = {norm2[i:i + 2] for i in range(len(norm2) - 1)}
    if not bigrams1 or not bigrams2:
        # A one-character title yields no bigrams; fall back to equality.
        return norm1 == norm2

    shared = len(bigrams1 & bigrams2)
    total = len(bigrams1 | bigrams2)
    return shared / total >= threshold
|
||
|
||
|
||
def dedup_news(news_list: list) -> list:
    """
    Remove near-duplicate news items based on title similarity.

    Items are visited in list order; an item is kept only when its title is
    not similar (per ``is_similar``) to the title of any item already kept,
    so the first occurrence of each group of similar titles is retained.

    Args:
        news_list: News items; each is read via ``.get("title", "")``.

    Returns:
        A new list with the retained items in their original order.
    """
    kept = []
    for candidate in news_list:
        candidate_title = candidate.get("title", "")
        already_kept = any(
            is_similar(candidate_title, earlier.get("title", ""))
            for earlier in kept
        )
        if not already_kept:
            kept.append(candidate)
    return kept
|
||
|
||
|
||
def score_and_filter(news_list: list, user_keywords: list = None) -> list:
    """
    Score every news item and return the list with scores attached.

    Each item is updated in place: its ``"score"`` key is set to the value
    returned by ``score_news``. No item is removed here; the same list
    object that was passed in is returned.

    Args:
        news_list: News items to score.
        user_keywords: Optional extra keywords forwarded to ``score_news``.

    Returns:
        ``news_list`` itself, with a ``"score"`` entry on every item.
    """
    for entry in news_list:
        rating = score_news(entry, user_keywords)
        entry["score"] = rating
    return news_list
|