Files
vps-management-bot/projects/news-bot/sources.py
2026-03-21 01:10:53 +08:00

375 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
信息源抓取模块 - 支持金十、华尔街见闻、36氪、新浪财经、Google News、Finviz、TechCrunch
"""
import re
import time
import hashlib
import logging
import httpx
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Request timeout handed to httpx.AsyncClient by the source fetchers.
TIMEOUT = 10
# Translation cache, so the same text is not sent for translation twice.
_translate_cache = {}
async def translate_to_zh(text: str) -> str:
    """Translate English text to Simplified Chinese via Google's free endpoint.

    Empty text, and text that is already predominantly Chinese, is returned
    unchanged. Successful translations are memoised in the module-level
    ``_translate_cache``. Any failure (network, HTTP status, unexpected
    payload) is logged and the original text is returned, so callers never
    need to handle an exception.

    Args:
        text: Text to translate. Only the first 500 characters are sent.

    Returns:
        The translated text, or ``text`` itself when no translation was
        needed, the service returned nothing, or the request failed.
    """
    if not text:
        return text
    # Skip the round trip when more than 30% of the characters are CJK
    # unified ideographs: the text is already mostly Chinese.
    zh_count = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
    if zh_count > len(text) * 0.3:
        return text
    # Key the cache on exactly what is sent to the API. Keying on a shorter
    # prefix would let two different texts that share their opening
    # characters receive each other's translation.
    query = text[:500]
    cached = _translate_cache.get(query)
    if cached is not None:
        return cached
    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(
                "https://translate.googleapis.com/translate_a/single",
                params={"client": "gtx", "sl": "en", "tl": "zh-CN", "dt": "t", "q": query},
            )
            # Fail loudly on rate-limit / error pages instead of trying to
            # decode them as a translation payload.
            resp.raise_for_status()
            result = resp.json()
        # result[0] is a list of segments; each segment's first element is
        # the translated fragment.
        translated = "".join(seg[0] for seg in result[0] if seg[0])
        if not translated:
            # Never replace a real title with an empty string.
            return text
        _translate_cache[query] = translated
        # Bound the cache: once it exceeds 500 entries, drop the 200 oldest
        # (dicts preserve insertion order).
        if len(_translate_cache) > 500:
            for stale_key in list(_translate_cache)[:200]:
                _translate_cache.pop(stale_key, None)
        return translated
    except Exception as e:
        # Best effort by design: fall back to the untranslated text.
        logger.error(f"翻译失败: {e}")
        return text
def _make_id(source: str, title: str) -> str:
"""生成新闻唯一 ID"""
raw = f"{source}:{title}"
return hashlib.md5(raw.encode()).hexdigest()[:16]
def _safe_ts(val, default=0) -> int:
"""安全转换时间戳"""
try:
if isinstance(val, (int, float)):
# 如果是毫秒级时间戳,转为秒
return int(val) if val < 2000000000 else int(val / 1000)
if isinstance(val, str):
# 尝试解析常见格式
for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"]:
try:
return int(datetime.strptime(val[:19], fmt[:len(val)+2]).timestamp())
except ValueError:
continue
except Exception:
pass
return default or int(time.time())
async def fetch_jin10() -> list:
    """Fetch the latest flash news from Jin10.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important``. Network, HTTP-status and parsing errors are logged;
        whatever was collected before the error (possibly nothing) is
        returned.
    """
    url = "https://flash-api.jin10.com/get_flash_list"
    params = {"channel": "-8200", "vip": "1", "max_time": "", "t": "1"}
    headers = {"x-app-id": "bVBF4FyRTn5NJF5n", "x-version": "1.0.0"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params, headers=headers)
            # Surface HTTP errors in the log instead of parsing an error body.
            resp.raise_for_status()
            # "data" may be present but null; treat that like an empty list.
            data = resp.json().get("data") or []
        for item in data:
            if not isinstance(item, dict):
                continue
            # The item's "data" field may be a dict or the text itself.
            payload = item.get("data")
            content = ""
            if isinstance(payload, dict):
                content = payload.get("content", "") or payload.get("title", "")
            elif isinstance(payload, str):
                content = payload
            # Fallback: a top-level "content" field.
            if not content:
                content = item.get("content", "")
            if not content:
                continue
            # Strip HTML tags.
            content = re.sub(r"<[^>]+>", "", content).strip()
            if not content:
                continue
            ts = _safe_ts(item.get("time", ""))
            news_id = item.get("id", "")
            news_url = f"https://www.jin10.com/flash_detail/{news_id}.html" if news_id else ""
            results.append({
                "id": _make_id("jin10", content[:80]),
                "source": "jin10",
                "source_name": "金十数据",
                "title": content[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": item.get("time", ""),
                "important": item.get("important", 0) == 1,
            })
    except Exception as e:
        logger.error(f"金十数据抓取失败: {e}")
    return results
async def fetch_wallstreet() -> list:
    """Fetch the latest live-news items from Wallstreetcn.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important``. Network, HTTP-status and parsing errors are logged;
        whatever was collected before the error (possibly nothing) is
        returned.
    """
    url = "https://api-one.wallstcn.com/apiv1/content/lives"
    params = {"channel": "global-channel", "limit": "20"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            # Surface HTTP errors in the log instead of parsing an error body.
            resp.raise_for_status()
            # Either level may be present but null; treat that as empty.
            items = (resp.json().get("data") or {}).get("items") or []
        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get("content_text", "") or item.get("title", "")
            if not title:
                continue
            # Strip HTML tags.
            title = re.sub(r"<[^>]+>", "", title).strip()
            if not title:
                continue
            ts = _safe_ts(item.get("display_time", 0))
            news_url = item.get("uri", "") or ""
            results.append({
                "id": _make_id("wallstreet", title[:80]),
                "source": "wallstreet",
                "source_name": "华尔街见闻",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": item.get("is_important", False),
            })
    except Exception as e:
        logger.error(f"华尔街见闻抓取失败: {e}")
    return results
async def fetch_kr36() -> list:
    """Fetch the latest newsflash items from 36Kr.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Network,
        HTTP-status and parsing errors are logged; whatever was collected
        before the error (possibly nothing) is returned.
    """
    url = "https://36kr.com/api/newsflash"
    params = {"per_page": "20"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            # Surface HTTP errors in the log instead of parsing an error body.
            resp.raise_for_status()
            # Either level may be present but null; treat that as empty.
            items = (resp.json().get("data") or {}).get("items") or []
        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get("title", "") or item.get("entity_name", "")
            if not title:
                continue
            ts = _safe_ts(item.get("published_at", ""))
            news_id = item.get("id", "")
            news_url = f"https://36kr.com/newsflashes/{news_id}" if news_id else ""
            results.append({
                "id": _make_id("kr36", title[:80]),
                "source": "kr36",
                "source_name": "36氪",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"36氪抓取失败: {e}")
    return results
async def fetch_sina() -> list:
    """Fetch the latest rolling news items from Sina Finance.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Network,
        HTTP-status and parsing errors are logged; whatever was collected
        before the error (possibly nothing) is returned.
    """
    url = "https://feed.mix.sina.com.cn/api/roll/get"
    params = {"pageid": "153", "lid": "2516", "k": "", "num": "20", "page": "1"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            # Surface HTTP errors in the log instead of parsing an error body.
            resp.raise_for_status()
            # Either level may be present but null; treat that as empty.
            data = (resp.json().get("result") or {}).get("data") or []
        for item in data:
            if not isinstance(item, dict):
                continue
            title = item.get("title", "")
            if not title:
                continue
            # Strip HTML tags.
            title = re.sub(r"<[^>]+>", "", title).strip()
            if not title:
                continue
            ts = _safe_ts(item.get("ctime", item.get("createtime", 0)))
            news_url = item.get("url", "") or item.get("link", "") or ""
            results.append({
                "id": _make_id("sina", title[:80]),
                "source": "sina",
                "source_name": "新浪财经",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"新浪财经抓取失败: {e}")
    return results
async def fetch_google_news() -> list:
    """Fetch a Google News topic RSS feed and translate the headlines.

    At most the first 20 feed items are used. Google News titles end with
    " - <publisher>"; that suffix is moved out of the title and into
    ``source_name``, and the remaining headline is translated to Chinese.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Network,
        HTTP-status and parsing errors are logged; whatever was collected
        before the error (possibly nothing) is returned.
    """
    url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB"
    params = {"hl": "en-US", "gl": "US", "ceid": "US:en"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params, headers={"User-Agent": "Mozilla/5.0"})
            # Surface HTTP errors in the log instead of an XML parse error.
            resp.raise_for_status()
            feed_xml = resp.text
        root = ET.fromstring(feed_xml)
        for item in root.findall(".//item")[:20]:
            raw_title = item.findtext("title", "").strip()
            if not raw_title:
                continue
            # Split off the trailing publisher suffix (" - Bloomberg.com" etc.).
            headline, sep, source_tag = raw_title.rpartition(" - ")
            if not sep:
                headline, source_tag = raw_title, ""
            # Guard against a title that is nothing but the suffix.
            headline = headline.strip() or raw_title
            news_url = item.findtext("link", "").strip()
            ts = 0
            pub = item.findtext("pubDate", "")
            if pub:
                try:
                    ts = int(parsedate_to_datetime(pub).timestamp())
                except Exception:
                    # Unparseable date: fall back to the fetch time.
                    ts = int(time.time())
            title_zh = await translate_to_zh(headline)
            results.append({
                # The ID stays derived from the raw feed title so that
                # identifiers remain stable for previously seen items.
                "id": _make_id("google", raw_title[:80]),
                "source": "google",
                "source_name": f"Google News ({source_tag})" if source_tag else "Google News",
                "title": title_zh[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"Google News 抓取失败: {e}")
    return results
async def fetch_finviz() -> list:
    """Scrape headlines from the Finviz news page and translate them.

    Headlines are extracted from the page HTML with regular expressions; at
    most the first 20 matches are used. The page scrape carries no per-item
    time, so every item is stamped with the time of the fetch.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Network,
        HTTP-status and parsing errors are logged; whatever was collected
        before the error (possibly nothing) is returned.
    """
    url = "https://finviz.com/news.ashx"
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
            # Without this, a block/rate-limit page simply matches nothing
            # and the failure goes unnoticed.
            resp.raise_for_status()
            page = resp.text
        # Extract (link, title) pairs from the news anchors.
        matches = re.findall(r'class="nn-tab-link"[^>]*href="([^"]*)"[^>]*>([^<]+)', page)
        if not matches:
            # Fallback: titles only, with an empty link.
            titles = re.findall(r'class="nn-tab-link"[^>]*>([^<]+)', page)
            matches = [("", t) for t in titles]
        for link, title in matches[:20]:
            title = title.strip()
            if not title:
                continue
            title_zh = await translate_to_zh(title)
            results.append({
                "id": _make_id("finviz", title[:80]),
                "source": "finviz",
                "source_name": "Finviz",
                "title": title_zh[:200],
                "url": link or "",
                "timestamp": int(time.time()),
                "time_str": datetime.now().strftime("%H:%M:%S"),
                "important": False,
            })
    except Exception as e:
        logger.error(f"Finviz 抓取失败: {e}")
    return results
async def fetch_techcrunch() -> list:
    """Fetch the TechCrunch RSS feed and translate the headlines.

    At most the first 20 feed items are used.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Network,
        HTTP-status and parsing errors are logged; whatever was collected
        before the error (possibly nothing) is returned.
    """
    url = "https://techcrunch.com/feed/"
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
            # Surface HTTP errors in the log instead of an XML parse error.
            resp.raise_for_status()
            feed_xml = resp.text
        root = ET.fromstring(feed_xml)
        for item in root.findall(".//item")[:20]:
            title = item.findtext("title", "").strip()
            if not title:
                continue
            news_url = item.findtext("link", "").strip()
            ts = 0
            pub = item.findtext("pubDate", "")
            if pub:
                try:
                    ts = int(parsedate_to_datetime(pub).timestamp())
                except Exception:
                    # Unparseable date: fall back to the fetch time.
                    ts = int(time.time())
            title_zh = await translate_to_zh(title)
            results.append({
                "id": _make_id("techcrunch", title[:80]),
                "source": "techcrunch",
                "source_name": "TechCrunch",
                "title": title_zh[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"TechCrunch 抓取失败: {e}")
    return results
# Source key -> fetcher coroutine function.
SOURCE_FETCHERS = {
    "jin10": fetch_jin10,
    "wallstreet": fetch_wallstreet,
    "kr36": fetch_kr36,
    "sina": fetch_sina,
    "google": fetch_google_news,
    "finviz": fetch_finviz,
    "techcrunch": fetch_techcrunch,
}
# Source key -> display name; uses the same keys as SOURCE_FETCHERS.
SOURCE_NAMES = {
    "jin10": "金十数据",
    "wallstreet": "华尔街见闻",
    "kr36": "36氪",
    "sina": "新浪财经",
    "google": "Google News",
    "finviz": "Finviz",
    "techcrunch": "TechCrunch",
}
async def fetch_all(enabled_sources: dict = None) -> list:
    """Run every enabled fetcher concurrently and merge their results.

    Args:
        enabled_sources: Mapping of source key to a truthy/falsy switch.
            Keys missing from the mapping count as enabled. ``None`` enables
            every source in ``SOURCE_FETCHERS``.

    Returns:
        One combined list of news dicts, ordered by ``timestamp`` with the
        newest first. A fetcher that raised is logged and contributes
        nothing.
    """
    import asyncio

    if enabled_sources is None:
        enabled_sources = dict.fromkeys(SOURCE_FETCHERS, True)
    pending = [
        fetch()
        for key, fetch in SOURCE_FETCHERS.items()
        if enabled_sources.get(key, True)
    ]
    outcomes = await asyncio.gather(*pending, return_exceptions=True)
    merged = []
    for outcome in outcomes:
        if isinstance(outcome, Exception):
            logger.error(f"抓取异常: {outcome}")
        elif isinstance(outcome, list):
            merged.extend(outcome)
    # Newest first.
    return sorted(merged, key=lambda entry: entry.get("timestamp", 0), reverse=True)