Rename to hkt.sh
This commit is contained in:
374
projects/news-bot/sources.py
Normal file
374
projects/news-bot/sources.py
Normal file
@@ -0,0 +1,374 @@
|
||||
"""
|
||||
信息源抓取模块 - 支持金十、华尔街见闻、36氪、新浪财经、Google News、Finviz、TechCrunch
|
||||
"""
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import logging
|
||||
import httpx
|
||||
import xml.etree.ElementTree as ET
|
||||
from datetime import datetime
|
||||
from email.utils import parsedate_to_datetime
|
||||
|
||||
logger = logging.getLogger(__name__)

# Default HTTP timeout (seconds) used by the source fetchers.
TIMEOUT = 10

# Translation cache: maps the exact text sent to the translator to its
# translation, so the same headline is not translated twice.
_translate_cache = {}


async def translate_to_zh(text: str) -> str:
    """Translate English text to Simplified Chinese via Google's free endpoint.

    Text that is empty, or in which more than 30% of the characters are CJK
    Unified Ideographs (U+4E00..U+9FFF), is returned unchanged. Only the first
    500 characters are sent for translation, and results are memoised in
    ``_translate_cache``.

    Args:
        text: The text to translate.

    Returns:
        The translated text, or ``text`` itself when no translation is needed,
        the request fails, or the service returns nothing usable. This
        function never raises; failures are logged.
    """
    if not text:
        return text

    # Skip text that is already mostly Chinese.
    zh_count = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
    if zh_count > len(text) * 0.3:
        return text

    # The cache key must be exactly what is sent to the service; keying on a
    # shorter prefix would let two different texts share one translation.
    query = text[:500]
    cached = _translate_cache.get(query)
    if cached is not None:
        return cached

    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(
                "https://translate.googleapis.com/translate_a/single",
                params={"client": "gtx", "sl": "en", "tl": "zh-CN", "dt": "t", "q": query},
            )
            # Surface HTTP errors (e.g. rate limiting) as a logged failure.
            resp.raise_for_status()
            result = resp.json()
        # result[0] is iterated as a list of segments whose first element is
        # the translated piece; empty pieces are skipped.
        translated = "".join(seg[0] for seg in result[0] if seg[0])
    except Exception as e:
        logger.error("翻译失败: %s", e)
        return text

    if not translated:
        # Do not cache or return an empty translation; keep the original text.
        return text

    _translate_cache[query] = translated
    # Bound the cache: once it exceeds 500 entries, drop the 200 oldest
    # (dicts preserve insertion order).
    if len(_translate_cache) > 500:
        for stale_key in list(_translate_cache)[:200]:
            del _translate_cache[stale_key]
    return translated
|
||||
|
||||
|
||||
def _make_id(source: str, title: str) -> str:
|
||||
"""生成新闻唯一 ID"""
|
||||
raw = f"{source}:{title}"
|
||||
return hashlib.md5(raw.encode()).hexdigest()[:16]
|
||||
|
||||
|
||||
def _safe_ts(val, default=0) -> int:
|
||||
"""安全转换时间戳"""
|
||||
try:
|
||||
if isinstance(val, (int, float)):
|
||||
# 如果是毫秒级时间戳,转为秒
|
||||
return int(val) if val < 2000000000 else int(val / 1000)
|
||||
if isinstance(val, str):
|
||||
# 尝试解析常见格式
|
||||
for fmt in ["%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S", "%Y-%m-%dT%H:%M:%S.%f"]:
|
||||
try:
|
||||
return int(datetime.strptime(val[:19], fmt[:len(val)+2]).timestamp())
|
||||
except ValueError:
|
||||
continue
|
||||
except Exception:
|
||||
pass
|
||||
return default or int(time.time())
|
||||
|
||||
|
||||
async def fetch_jin10() -> list:
    """Fetch the latest flash news items from Jin10.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important``. Failures are logged, not raised; whatever was collected
        before the failure (possibly nothing) is returned.
    """
    url = "https://flash-api.jin10.com/get_flash_list"
    params = {"channel": "-8200", "vip": "1", "max_time": "", "t": "1"}
    headers = {"x-app-id": "bVBF4FyRTn5NJF5n", "x-version": "1.0.0"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params, headers=headers)
            # Treat HTTP error statuses as a logged failure instead of
            # silently parsing an error body.
            resp.raise_for_status()
            data = resp.json().get("data") or []

        for item in data:
            if not isinstance(item, dict):
                continue
            # The item's "data" field may be a dict or plain text.
            payload = item.get("data")
            content = ""
            if isinstance(payload, dict):
                content = payload.get("content", "") or payload.get("title", "")
            elif isinstance(payload, str):
                content = payload
            # Fallback: use the top-level "content" field.
            if not content:
                content = item.get("content", "")
            if not content:
                continue
            # Strip HTML tags.
            content = re.sub(r"<[^>]+>", "", content).strip()
            if not content:
                continue

            news_id = item.get("id", "")
            results.append({
                "id": _make_id("jin10", content[:80]),
                "source": "jin10",
                "source_name": "金十数据",
                "title": content[:200],
                "url": f"https://www.jin10.com/flash_detail/{news_id}.html" if news_id else "",
                "timestamp": _safe_ts(item.get("time", "")),
                "time_str": item.get("time", ""),
                "important": item.get("important", 0) == 1,
            })
    except Exception as e:
        logger.error("金十数据抓取失败: %s", e)
    return results
|
||||
|
||||
|
||||
async def fetch_wallstreet() -> list:
    """Fetch the latest live news items from Wallstreetcn.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important``. Failures are logged, not raised; whatever was collected
        before the failure (possibly nothing) is returned.
    """
    url = "https://api-one.wallstcn.com/apiv1/content/lives"
    params = {"channel": "global-channel", "limit": "20"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            # Treat HTTP error statuses as a logged failure.
            resp.raise_for_status()
            # Tolerate a null "data" / "items" value in the response.
            items = (resp.json().get("data") or {}).get("items") or []

        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get("content_text", "") or item.get("title", "")
            if not title:
                continue
            # Strip HTML tags.
            title = re.sub(r"<[^>]+>", "", title).strip()
            if not title:
                continue

            ts = _safe_ts(item.get("display_time", 0))
            results.append({
                "id": _make_id("wallstreet", title[:80]),
                "source": "wallstreet",
                "source_name": "华尔街见闻",
                "title": title[:200],
                "url": item.get("uri", "") or "",
                "timestamp": ts,
                # A zero timestamp (field missing) yields an empty time string.
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": item.get("is_important", False),
            })
    except Exception as e:
        logger.error("华尔街见闻抓取失败: %s", e)
    return results
|
||||
|
||||
|
||||
async def fetch_kr36() -> list:
    """Fetch the latest newsflash items from 36Kr.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged,
        not raised; whatever was collected before the failure is returned.
    """
    url = "https://36kr.com/api/newsflash"
    params = {"per_page": "20"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            # Treat HTTP error statuses as a logged failure.
            resp.raise_for_status()
            # Tolerate a null "data" / "items" value in the response.
            items = (resp.json().get("data") or {}).get("items") or []

        for item in items:
            if not isinstance(item, dict):
                continue
            title = item.get("title", "") or item.get("entity_name", "")
            if not title:
                continue

            ts = _safe_ts(item.get("published_at", ""))
            news_id = item.get("id", "")
            results.append({
                "id": _make_id("kr36", title[:80]),
                "source": "kr36",
                "source_name": "36氪",
                "title": title[:200],
                "url": f"https://36kr.com/newsflashes/{news_id}" if news_id else "",
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error("36氪抓取失败: %s", e)
    return results
|
||||
|
||||
|
||||
async def fetch_sina() -> list:
    """Fetch the latest rolling news items from Sina Finance.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged,
        not raised; whatever was collected before the failure is returned.
    """
    url = "https://feed.mix.sina.com.cn/api/roll/get"
    params = {"pageid": "153", "lid": "2516", "k": "", "num": "20", "page": "1"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            # Treat HTTP error statuses as a logged failure.
            resp.raise_for_status()
            # Tolerate a null "result" / "data" value in the response.
            data = (resp.json().get("result") or {}).get("data") or []

        for item in data:
            if not isinstance(item, dict):
                continue
            title = item.get("title", "")
            if not title:
                continue
            # Strip HTML tags.
            title = re.sub(r"<[^>]+>", "", title).strip()
            if not title:
                continue

            # Prefer "ctime"; fall back to "createtime" when it is absent.
            ts = _safe_ts(item.get("ctime", item.get("createtime", 0)))
            results.append({
                "id": _make_id("sina", title[:80]),
                "source": "sina",
                "source_name": "新浪财经",
                "title": title[:200],
                "url": item.get("url", "") or item.get("link", "") or "",
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error("新浪财经抓取失败: %s", e)
    return results
|
||||
|
||||
|
||||
async def fetch_google_news() -> list:
    """Fetch up to 20 items from a Google News topic RSS feed.

    Each headline is translated to Chinese with ``translate_to_zh``. The
    trailing `` - <publisher>`` suffix that the feed appends to titles is
    moved out of the headline and into ``source_name``.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged,
        not raised; whatever was collected before the failure is returned.
    """
    url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB"
    params = {"hl": "en-US", "gl": "US", "ceid": "US:en"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params, headers={"User-Agent": "Mozilla/5.0"})
            # Treat HTTP error statuses as a logged failure.
            resp.raise_for_status()

        # NOTE(review): the feed is parsed with the stdlib XML parser, which
        # is not hardened against malicious documents.
        root = ET.fromstring(resp.text)
        for item in root.findall(".//item")[:20]:
            title = item.findtext("title", "").strip()
            if not title:
                continue

            # Split off the publisher suffix, e.g. " - Bloomberg.com".
            headline, sep, source_tag = title.rpartition(" - ")
            headline = headline.strip()
            if not sep or not headline:
                headline, source_tag = title, ""

            news_url = item.findtext("link", "").strip()
            ts = 0
            pub = item.findtext("pubDate", "")
            if pub:
                try:
                    ts = int(parsedate_to_datetime(pub).timestamp())
                except Exception:
                    # Unparseable date: fall back to the current time.
                    ts = int(time.time())

            title_zh = await translate_to_zh(headline)
            results.append({
                # The ID is still derived from the raw feed title so that
                # previously generated IDs remain stable.
                "id": _make_id("google", title[:80]),
                "source": "google",
                "source_name": f"Google News ({source_tag})" if source_tag else "Google News",
                "title": title_zh[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error("Google News 抓取失败: %s", e)
    return results
|
||||
|
||||
|
||||
async def fetch_finviz() -> list:
    """Scrape up to 20 headlines from the Finviz news page.

    Headlines and links are extracted from the page HTML with regular
    expressions, HTML entities are decoded, and each headline is translated
    to Chinese with ``translate_to_zh``. The page scrape carries no
    publication time, so every item is stamped with the fetch time.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged,
        not raised; whatever was collected before the failure is returned.
    """
    # Local import: entity decoding is only needed by this scraper.
    import html

    url = "https://finviz.com/news.ashx"
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
            # Treat HTTP error statuses as a logged failure.
            resp.raise_for_status()

        page = resp.text
        # Extract (href, headline) pairs from the news links.
        matches = re.findall(r'class="nn-tab-link"[^>]*href="([^"]*)"[^>]*>([^<]+)', page)
        if not matches:
            # Fallback: extract headlines only.
            titles = re.findall(r'class="nn-tab-link"[^>]*>([^<]+)', page)
            matches = [("", t) for t in titles]

        for link, title in matches[:20]:
            raw_title = title.strip()
            if not raw_title:
                continue
            # Regex-scraped text still contains entities such as "&amp;";
            # decode them before translating or displaying.
            headline = html.unescape(raw_title).strip() or raw_title
            title_zh = await translate_to_zh(headline)
            results.append({
                # The ID is still derived from the raw scraped title so that
                # previously generated IDs remain stable.
                "id": _make_id("finviz", raw_title[:80]),
                "source": "finviz",
                "source_name": "Finviz",
                "title": title_zh[:200],
                "url": html.unescape(link),
                "timestamp": int(time.time()),
                "time_str": datetime.now().strftime("%H:%M:%S"),
                "important": False,
            })
    except Exception as e:
        logger.error("Finviz 抓取失败: %s", e)
    return results
|
||||
|
||||
|
||||
async def fetch_techcrunch() -> list:
    """Fetch up to 20 items from the TechCrunch RSS feed.

    Each headline is translated to Chinese with ``translate_to_zh``.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged,
        not raised; whatever was collected before the failure is returned.
    """
    url = "https://techcrunch.com/feed/"
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
            # Treat HTTP error statuses as a logged failure.
            resp.raise_for_status()

        root = ET.fromstring(resp.text)
        for item in root.findall(".//item")[:20]:
            title = item.findtext("title", "").strip()
            if not title:
                continue

            news_url = item.findtext("link", "").strip()
            ts = 0
            pub = item.findtext("pubDate", "")
            if pub:
                try:
                    ts = int(parsedate_to_datetime(pub).timestamp())
                except Exception:
                    # Unparseable date: fall back to the current time.
                    ts = int(time.time())

            title_zh = await translate_to_zh(title)
            results.append({
                "id": _make_id("techcrunch", title[:80]),
                "source": "techcrunch",
                "source_name": "TechCrunch",
                "title": title_zh[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error("TechCrunch 抓取失败: %s", e)
    return results
|
||||
|
||||
|
||||
# Source key -> coroutine function that fetches that source.
SOURCE_FETCHERS = dict(
    jin10=fetch_jin10,
    wallstreet=fetch_wallstreet,
    kr36=fetch_kr36,
    sina=fetch_sina,
    google=fetch_google_news,
    finviz=fetch_finviz,
    techcrunch=fetch_techcrunch,
)
|
||||
|
||||
# Source key -> human-readable display name.
SOURCE_NAMES = dict(
    jin10="金十数据",
    wallstreet="华尔街见闻",
    kr36="36氪",
    sina="新浪财经",
    google="Google News",
    finviz="Finviz",
    techcrunch="TechCrunch",
)
|
||||
|
||||
|
||||
async def fetch_all(enabled_sources: dict = None) -> list:
    """Fetch every enabled source concurrently and merge the results.

    Args:
        enabled_sources: Optional mapping of source key to a truthy/falsy
            flag. Sources missing from the mapping count as enabled; ``None``
            enables every source in ``SOURCE_FETCHERS``.

    Returns:
        All fetched news dicts in one list, sorted by ``timestamp`` with the
        newest first. A source whose fetcher raises is logged and skipped.
    """
    import asyncio

    flags = enabled_sources if enabled_sources is not None else {}
    active = [
        (name, fetcher)
        for name, fetcher in SOURCE_FETCHERS.items()
        if flags.get(name, True)
    ]

    # return_exceptions=True keeps one failing source from cancelling the rest.
    outcomes = await asyncio.gather(
        *(fetcher() for _, fetcher in active),
        return_exceptions=True,
    )

    all_news = []
    for (name, _), outcome in zip(active, outcomes):
        # BaseException also covers cancellation results, which are not
        # Exception subclasses and would otherwise be dropped without a trace.
        if isinstance(outcome, BaseException):
            logger.error("抓取异常 [%s]: %s", name, outcome)
            continue
        if isinstance(outcome, list):
            all_news.extend(outcome)

    # Sort by time, newest first; a missing or None timestamp sorts as 0.
    all_news.sort(key=lambda news: news.get("timestamp") or 0, reverse=True)
    return all_news
|
||||
Reference in New Issue
Block a user