Files
vps-management-bot/projects/news-bot/sources.py

375 lines
14 KiB
Python
Raw Normal View History

2026-03-21 01:10:53 +08:00
"""
News source fetchers: Jin10, Wallstreetcn, 36Kr, Sina Finance, Google News,
Finviz and TechCrunch.
"""
import hashlib
import html
import logging
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime

import httpx
logger = logging.getLogger(__name__)
# Request timeout handed to httpx.AsyncClient(timeout=...) by the source
# fetchers (httpx interprets a bare number as seconds).
TIMEOUT = 10
# In-memory translation cache used by translate_to_zh to avoid translating the
# same text twice; keyed by (a prefix of) the source text, value is the
# translated text.
_translate_cache = {}
async def translate_to_zh(text: str) -> str:
    """Translate English text to Simplified Chinese via Google's free endpoint.

    Falsy input and text that is already mostly Chinese are returned
    unchanged without any network call. Only the first 500 characters are
    sent for translation. Results are memoised in ``_translate_cache``.

    Args:
        text: The text to translate.

    Returns:
        The translated text, or ``text`` itself when translation is skipped,
        fails, or yields an empty result. This function never raises for
        network/parsing errors; they are logged instead.
    """
    if not text:
        return text

    # Skip the request when more than 30% of the characters are CJK unified
    # ideographs: the text is considered Chinese already.
    zh_count = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
    if zh_count > len(text) * 0.3:
        return text

    # The cache key must be exactly what is sent to the API. Keying on a
    # shorter prefix would hand one text's translation to a different text
    # that merely shares the same opening characters.
    query = text[:500]
    cached = _translate_cache.get(query)
    if cached is not None:
        return cached

    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(
                "https://translate.googleapis.com/translate_a/single",
                params={"client": "gtx", "sl": "en", "tl": "zh-CN", "dt": "t", "q": query},
            )
            # Surface HTTP errors explicitly instead of as a JSON decode error.
            resp.raise_for_status()
            result = resp.json()
        # result[0] is iterated as a list of segments whose first element is
        # the translated piece; empty/None segments are skipped.
        translated = "".join(seg[0] for seg in result[0] if seg and seg[0])
    except Exception as e:
        logger.error(f"翻译失败: {e}")
        return text

    if not translated:
        # Never replace a real title with an empty string, and do not cache it.
        return text

    _translate_cache[query] = translated
    # Bound the cache: once it exceeds 500 entries drop the 200 oldest
    # (dicts preserve insertion order).
    if len(_translate_cache) > 500:
        for stale_key in list(_translate_cache)[:200]:
            del _translate_cache[stale_key]
    return translated
def _make_id(source: str, title: str) -> str:
"""生成新闻唯一 ID"""
raw = f"{source}:{title}"
return hashlib.md5(raw.encode()).hexdigest()[:16]
def _safe_ts(val, default=0) -> int:
"""安全转换时间戳"""
try:
if isinstance(val, (int, float)):
# 如果是毫秒级时间戳,转为秒
return int(val) if val < 2000000000 else int(val / 1000)
if isinstance(val, str):
# 尝试解析常见格式
for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"]:
try:
return int(datetime.strptime(val[:19], fmt[:len(val)+2]).timestamp())
except ValueError:
continue
except Exception:
pass
return default or int(time.time())
async def fetch_jin10() -> list:
    """Fetch the Jin10 flash-news list.

    Each entry's text is taken from ``item["data"]`` (its ``content`` or
    ``title`` when it is a dict, or the value itself when it is a string),
    falling back to ``item["content"]``. HTML tags are stripped.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title`` (at most 200 chars), ``url``,
        ``timestamp`` (epoch seconds), ``time_str`` (the raw ``time`` field)
        and ``important`` (True when the item's ``important`` field is 1).
        On any request/parsing error the error is logged and whatever was
        collected so far (usually an empty list) is returned.
    """
    url = "https://flash-api.jin10.com/get_flash_list"
    params = {"channel": "-8200", "vip": "1", "max_time": "", "t": "1"}
    headers = {"x-app-id": "bVBF4FyRTn5NJF5n", "x-version": "1.0.0"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params, headers=headers)
            resp.raise_for_status()
            # `or []` also covers an explicit `"data": null` in the response.
            data = resp.json().get("data") or []
        for item in data:
            if not isinstance(item, dict):
                continue
            # The item's "data" field may be a dict or the text itself.
            payload = item.get("data")
            content = ""
            if isinstance(payload, dict):
                content = payload.get("content", "") or payload.get("title", "")
            elif isinstance(payload, str):
                content = payload
            # Fallback: top-level "content" field.
            if not content:
                content = item.get("content", "")
            if not content or not isinstance(content, str):
                continue
            # Strip HTML tags.
            content = re.sub(r"<[^>]+>", "", content).strip()
            if not content:
                continue
            ts = _safe_ts(item.get("time", ""))
            news_id = item.get("id", "")
            news_url = f"https://www.jin10.com/flash_detail/{news_id}.html" if news_id else ""
            results.append({
                "id": _make_id("jin10", content[:80]),
                "source": "jin10",
                "source_name": "金十数据",
                "title": content[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": item.get("time", ""),
                "important": item.get("important", 0) == 1,
            })
    except Exception as e:
        logger.error(f"金十数据抓取失败: {e}")
    return results
async def fetch_wallstreet() -> list:
    """Fetch the Wallstreetcn live-news feed (``global-channel``, 20 items).

    The title is the item's ``content_text`` (or ``title`` as fallback) with
    HTML tags stripped.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title`` (at most 200 chars), ``url`` (the item's
        ``uri``), ``timestamp`` (epoch seconds from ``display_time``),
        ``time_str`` (``HH:MM:SS`` rendered from the timestamp in local
        time, empty when the timestamp is 0) and ``important``.
        On any request/parsing error the error is logged and whatever was
        collected so far (usually an empty list) is returned.
    """
    url = "https://api-one.wallstcn.com/apiv1/content/lives"
    params = {"channel": "global-channel", "limit": "20"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            resp.raise_for_status()
            # `or` guards cover explicit nulls in the JSON payload.
            payload = resp.json().get("data") or {}
            items = payload.get("items") or []
        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get("content_text", "") or item.get("title", "")
            if not title or not isinstance(title, str):
                continue
            # Strip HTML tags.
            title = re.sub(r"<[^>]+>", "", title).strip()
            if not title:
                continue
            ts = _safe_ts(item.get("display_time", 0))
            news_url = item.get("uri", "") or ""
            results.append({
                "id": _make_id("wallstreet", title[:80]),
                "source": "wallstreet",
                "source_name": "华尔街见闻",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                # Coerced so that every source emits a real bool here.
                "important": bool(item.get("is_important", False)),
            })
    except Exception as e:
        logger.error(f"华尔街见闻抓取失败: {e}")
    return results
async def fetch_kr36() -> list:
    """Fetch the 36Kr newsflash feed (20 items per page).

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title`` (the item's ``title`` or ``entity_name``,
        at most 200 chars), ``url`` (built from the item ``id``),
        ``timestamp`` (epoch seconds from ``published_at``), ``time_str``
        (``HH:MM:SS`` in local time, empty when the timestamp is 0) and
        ``important`` (always False for this source).
        On any request/parsing error the error is logged and whatever was
        collected so far (usually an empty list) is returned.
    """
    url = "https://36kr.com/api/newsflash"
    params = {"per_page": "20"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            resp.raise_for_status()
            # `or` guards cover explicit nulls in the JSON payload.
            payload = resp.json().get("data") or {}
            items = payload.get("items") or []
        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get("title", "") or item.get("entity_name", "")
            if not title or not isinstance(title, str):
                continue
            ts = _safe_ts(item.get("published_at", ""))
            news_id = item.get("id", "")
            news_url = f"https://36kr.com/newsflashes/{news_id}" if news_id else ""
            results.append({
                "id": _make_id("kr36", title[:80]),
                "source": "kr36",
                "source_name": "36氪",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"36氪抓取失败: {e}")
    return results
async def fetch_sina() -> list:
    """Fetch the Sina Finance rolling-news feed (first page, 20 items).

    Titles have HTML tags stripped. The timestamp is read from ``ctime``,
    falling back to ``createtime``.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title`` (at most 200 chars), ``url`` (the item's
        ``url`` or ``link``), ``timestamp`` (epoch seconds), ``time_str``
        (``HH:MM:SS`` in local time, empty when the timestamp is 0) and
        ``important`` (always False for this source).
        On any request/parsing error the error is logged and whatever was
        collected so far (usually an empty list) is returned.
    """
    url = "https://feed.mix.sina.com.cn/api/roll/get"
    params = {"pageid": "153", "lid": "2516", "k": "", "num": "20", "page": "1"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            resp.raise_for_status()
            # `or` guards cover explicit nulls in the JSON payload.
            payload = resp.json().get("result") or {}
            data = payload.get("data") or []
        for item in data:
            if not isinstance(item, dict):
                continue
            title = item.get("title", "")
            if not title or not isinstance(title, str):
                continue
            # Strip HTML tags.
            title = re.sub(r"<[^>]+>", "", title).strip()
            if not title:
                continue
            ts = _safe_ts(item.get("ctime", item.get("createtime", 0)))
            news_url = item.get("url", "") or item.get("link", "") or ""
            results.append({
                "id": _make_id("sina", title[:80]),
                "source": "sina",
                "source_name": "新浪财经",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"新浪财经抓取失败: {e}")
    return results
async def fetch_google_news() -> list:
    """Fetch a Google News topic RSS feed and translate the headlines.

    At most the first 20 ``<item>`` elements are used. RSS titles of the
    form ``"Headline - Publisher"`` are split on the last ``" - "``: the
    publisher goes into ``source_name`` and only the headline is translated
    and stored as ``title``. The ``id`` is still derived from the full raw
    title so identifiers stay stable.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title`` (translated, at most 200 chars), ``url``,
        ``timestamp`` (epoch seconds from ``pubDate``; 0 when the element is
        missing, current time when it cannot be parsed), ``time_str``
        (``HH:MM:SS`` in local time, empty when the timestamp is 0) and
        ``important`` (always False for this source).
        On any request/parsing error the error is logged and whatever was
        collected so far (usually an empty list) is returned.
    """
    url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB"
    params = {"hl": "en-US", "gl": "US", "ceid": "US:en"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params, headers={"User-Agent": "Mozilla/5.0"})
            resp.raise_for_status()
        root = ET.fromstring(resp.text)
        for item in root.findall(".//item")[:20]:
            raw_title = item.findtext("title", "").strip()
            if not raw_title:
                continue
            # Split off the publisher suffix (" - Bloomberg.com" etc.).
            headline, sep, source_tag = raw_title.rpartition(" - ")
            if not sep:
                headline, source_tag = raw_title, ""
            headline = headline.strip() or raw_title
            news_url = item.findtext("link", "").strip()
            ts = 0
            pub = item.findtext("pubDate", "")
            if pub:
                try:
                    ts = int(parsedate_to_datetime(pub).timestamp())
                except Exception:
                    # Unparseable date: fall back to "now".
                    ts = int(time.time())
            # Translations run one at a time; each call is a separate request.
            title_zh = await translate_to_zh(headline)
            results.append({
                # Hash the raw title (suffix included) to keep existing IDs.
                "id": _make_id("google", raw_title[:80]),
                "source": "google",
                "source_name": f"Google News ({source_tag})" if source_tag else "Google News",
                "title": title_zh[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"Google News 抓取失败: {e}")
    return results
async def fetch_finviz() -> list:
    """Scrape headlines from the Finviz news page and translate them.

    Links and titles are extracted with a regex over anchors carrying
    ``class="nn-tab-link"``; when no link+title pair matches, a title-only
    pattern is tried. At most 20 entries are used. HTML entities in the
    scraped title and link are decoded before translation. The ``id`` is
    derived from the raw (still-escaped) title so identifiers stay stable.

    The page markup carries no timestamp that this function reads, so every
    item is stamped with the time of the fetch.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title`` (translated, at most 200 chars), ``url``,
        ``timestamp``, ``time_str`` and ``important`` (always False).
        On any request/parsing error the error is logged and whatever was
        collected so far (usually an empty list) is returned.
    """
    url = "https://finviz.com/news.ashx"
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
            # A blocked/failed request should be logged, not look like
            # "no news today".
            resp.raise_for_status()
        page = resp.text
        # Extract (link, title) pairs from the HTML.
        matches = re.findall(r'class="nn-tab-link"[^>]*href="([^"]*)"[^>]*>([^<]+)', page)
        if not matches:
            # Fallback: titles only.
            titles = re.findall(r'class="nn-tab-link"[^>]*>([^<]+)', page)
            matches = [("", t) for t in titles]
        for link, raw_title in matches[:20]:
            raw_title = raw_title.strip()
            if not raw_title:
                continue
            # Decode entities such as &amp; / &#39; left over from the HTML.
            title = html.unescape(raw_title).strip() or raw_title
            title_zh = await translate_to_zh(title)
            results.append({
                "id": _make_id("finviz", raw_title[:80]),
                "source": "finviz",
                "source_name": "Finviz",
                "title": title_zh[:200],
                "url": html.unescape(link) if link else "",
                "timestamp": int(time.time()),
                "time_str": datetime.now().strftime("%H:%M:%S"),
                "important": False,
            })
    except Exception as e:
        logger.error(f"Finviz 抓取失败: {e}")
    return results
async def fetch_techcrunch() -> list:
    """Fetch the TechCrunch RSS feed and translate the headlines.

    At most the first 20 ``<item>`` elements are used.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title`` (translated, at most 200 chars), ``url``,
        ``timestamp`` (epoch seconds from ``pubDate``; 0 when the element is
        missing, current time when it cannot be parsed), ``time_str``
        (``HH:MM:SS`` in local time, empty when the timestamp is 0) and
        ``important`` (always False for this source).
        On any request/parsing error the error is logged and whatever was
        collected so far (usually an empty list) is returned.
    """
    url = "https://techcrunch.com/feed/"
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
            # Report HTTP failures as such rather than as an XML parse error.
            resp.raise_for_status()
        root = ET.fromstring(resp.text)
        for item in root.findall(".//item")[:20]:
            title = item.findtext("title", "").strip()
            if not title:
                continue
            news_url = item.findtext("link", "").strip()
            ts = 0
            pub = item.findtext("pubDate", "")
            if pub:
                try:
                    ts = int(parsedate_to_datetime(pub).timestamp())
                except Exception:
                    # Unparseable date: fall back to "now".
                    ts = int(time.time())
            # Translations run one at a time; each call is a separate request.
            title_zh = await translate_to_zh(title)
            results.append({
                "id": _make_id("techcrunch", title[:80]),
                "source": "techcrunch",
                "source_name": "TechCrunch",
                "title": title_zh[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"TechCrunch 抓取失败: {e}")
    return results
# Source key -> fetcher coroutine function. fetch_all() iterates this mapping
# and calls every fetcher whose key is enabled.
SOURCE_FETCHERS = {
    "jin10": fetch_jin10,
    "wallstreet": fetch_wallstreet,
    "kr36": fetch_kr36,
    "sina": fetch_sina,
    "google": fetch_google_news,
    "finviz": fetch_finviz,
    "techcrunch": fetch_techcrunch,
}
# Source key -> human-readable source name (same keys as SOURCE_FETCHERS).
# Not referenced elsewhere in the visible code; presumably consumed by
# callers for display — verify against the importing modules.
SOURCE_NAMES = {
    "jin10": "金十数据",
    "wallstreet": "华尔街见闻",
    "kr36": "36氪",
    "sina": "新浪财经",
    "google": "Google News",
    "finviz": "Finviz",
    "techcrunch": "TechCrunch",
}
async def fetch_all(enabled_sources: dict = None) -> list:
    """Fetch every enabled source concurrently and merge the results.

    Args:
        enabled_sources: Optional mapping of source key -> bool. A source
            runs unless its key maps to a falsy value; keys missing from the
            mapping count as enabled. ``None`` enables every source in
            ``SOURCE_FETCHERS``.

    Returns:
        All fetched news dicts in one list, sorted by ``timestamp`` with the
        newest first. A source whose fetcher raised is logged (with its key)
        and skipped; the other sources are unaffected.
    """
    import asyncio

    if enabled_sources is None:
        enabled_sources = {}

    # Keep the source key next to each coroutine so that failures can be
    # attributed to a source in the log.
    active = [
        (name, fetcher)
        for name, fetcher in SOURCE_FETCHERS.items()
        if enabled_sources.get(name, True)
    ]
    outcomes = await asyncio.gather(
        *(fetcher() for _, fetcher in active),
        return_exceptions=True,
    )

    all_news = []
    for (name, _), outcome in zip(active, outcomes):
        if isinstance(outcome, Exception):
            logger.error(f"抓取异常 [{name}]: {outcome}")
            continue
        if isinstance(outcome, list):
            all_news.extend(outcome)

    # Newest first.
    all_news.sort(key=lambda x: x.get("timestamp", 0), reverse=True)
    return all_news