"""News source fetching module.

Supports Jin10, Wallstreetcn, 36Kr, Sina Finance, Google News, Finviz and
TechCrunch.
"""
|
||
import hashlib
import logging
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime

import httpx
|
||
|
||
logger = logging.getLogger(__name__)

# Timeout passed to the httpx clients used by the source fetchers.
TIMEOUT = 10

# In-memory translation cache, used to avoid translating the same text twice.
_translate_cache = {}
|
||
|
||
|
||
async def translate_to_zh(text: str) -> str:
    """Translate English text to Simplified Chinese via Google Translate.

    Uses the free ``translate_a/single`` endpoint. Translation is
    best-effort: any failure is logged and the input is returned unchanged.

    Args:
        text: Text to translate. Only the first 500 characters are sent.

    Returns:
        The translated text, or ``text`` itself when it is empty, is already
        mostly Chinese, or could not be translated.
    """
    if not text:
        return text

    # Skip text that is already mostly Chinese (more than 30% of its
    # characters fall in the CJK Unified Ideographs range).
    zh_count = sum(1 for ch in text if "\u4e00" <= ch <= "\u9fff")
    if zh_count > len(text) * 0.3:
        return text

    # Key the cache on exactly what is sent to the API. Keying on a shorter
    # prefix would let two different texts that share that prefix return
    # each other's translation.
    query = text[:500]
    cached = _translate_cache.get(query)
    if cached is not None:
        return cached

    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(
                "https://translate.googleapis.com/translate_a/single",
                params={"client": "gtx", "sl": "en", "tl": "zh-CN", "dt": "t", "q": query},
            )
            resp.raise_for_status()
            payload = resp.json()
        # payload[0] is a list of segments; the translated piece of each
        # segment is its first element.
        translated = "".join(seg[0] for seg in payload[0] if seg and seg[0])
    except Exception as e:
        logger.error(f"翻译失败: {e}")
        return text

    if not translated:
        # Never cache or return an empty translation; keep the original text.
        return text

    _translate_cache[query] = translated
    # Bound the cache: once it exceeds 500 entries, drop the 200 oldest
    # (dicts preserve insertion order).
    if len(_translate_cache) > 500:
        for stale_key in list(_translate_cache)[:200]:
            _translate_cache.pop(stale_key, None)
    return translated
|
||
|
||
|
||
def _make_id(source: str, title: str) -> str:
    """Return a short, deterministic ID for a news item.

    The ID is the first 16 hex characters of the MD5 digest of
    ``"<source>:<title>"`` encoded as UTF-8. MD5 serves only as a
    fingerprint here, not as a security measure.

    Args:
        source: Source key, e.g. ``"jin10"``.
        title: Title (or title prefix) of the news item.

    Returns:
        A 16-character lowercase hexadecimal string.
    """
    raw = f"{source}:{title}"
    return hashlib.md5(raw.encode("utf-8")).hexdigest()[:16]
|
||
|
||
|
||
def _safe_ts(val, default=0) -> int:
    """Convert a timestamp-like value to Unix seconds without raising.

    Args:
        val: One of
            * an int/float Unix timestamp in seconds or milliseconds,
            * a string of decimal digits holding such a timestamp,
            * a date-time string starting with ``YYYY-MM-DD HH:MM:SS`` or
              ``YYYY-MM-DDTHH:MM:SS`` (anything after the seconds, such as
              fractional seconds, is ignored).
        default: Value returned when ``val`` cannot be interpreted. A falsy
            ``default`` means "use the current time".

    Returns:
        The timestamp in whole seconds.
    """
    if isinstance(val, str):
        text = val.strip()
        if text.isdecimal():
            # JSON APIs frequently deliver numeric timestamps as strings.
            val = int(text)
        else:
            # NOTE: only the first 19 characters are parsed, so any UTC
            # offset suffix is ignored and the value is read as local time.
            for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S"):
                try:
                    return int(datetime.strptime(text[:19], fmt).timestamp())
                except ValueError:
                    continue

    if isinstance(val, (int, float)):
        try:
            # Values at or above 2_000_000_000 are taken to be milliseconds.
            return int(val) if val < 2000000000 else int(val / 1000)
        except (ValueError, OverflowError):
            # NaN, infinity, or an integer too large to convert.
            pass

    return default or int(time.time())
|
||
|
||
|
||
async def fetch_jin10() -> list:
    """Fetch the latest flash news from Jin10.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important``. Failures are logged and never raised; the items
        collected before the failure (possibly none) are returned.
    """
    url = "https://flash-api.jin10.com/get_flash_list"
    params = {"channel": "-8200", "vip": "1", "max_time": "", "t": "1"}
    headers = {"x-app-id": "bVBF4FyRTn5NJF5n", "x-version": "1.0.0"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params, headers=headers)
            resp.raise_for_status()
            # "data" may be missing or null; treat both as "no items".
            data = resp.json().get("data") or []

        for item in data:
            # The item's "data" field may be a dict or the text itself.
            payload = item.get("data")
            content = ""
            if isinstance(payload, dict):
                content = payload.get("content", "") or payload.get("title", "")
            elif isinstance(payload, str):
                content = payload
            # Fallback: use the item's top-level "content" field.
            if not content:
                content = item.get("content", "")
            if not content:
                continue

            # Strip HTML tags.
            content = re.sub(r"<[^>]+>", "", content).strip()
            if not content:
                continue

            ts = _safe_ts(item.get("time", ""))
            news_id = item.get("id", "")
            news_url = f"https://www.jin10.com/flash_detail/{news_id}.html" if news_id else ""
            results.append({
                "id": _make_id("jin10", content[:80]),
                "source": "jin10",
                "source_name": "金十数据",
                "title": content[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": item.get("time", ""),
                "important": item.get("important", 0) == 1,
            })
    except Exception as e:
        logger.error(f"金十数据抓取失败: {e}")
    return results
|
||
|
||
|
||
async def fetch_wallstreet() -> list:
    """Fetch the latest live news from Wallstreetcn.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important``. Failures are logged and never raised; the items
        collected before the failure (possibly none) are returned.
    """
    url = "https://api-one.wallstcn.com/apiv1/content/lives"
    params = {"channel": "global-channel", "limit": "20"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            resp.raise_for_status()
            # "data" / "items" may be missing or null; treat as empty.
            items = (resp.json().get("data") or {}).get("items") or []

        for item in items:
            title = item.get("content_text", "") or item.get("title", "")
            if not title:
                continue
            # Strip HTML tags.
            title = re.sub(r"<[^>]+>", "", title).strip()
            if not title:
                continue

            ts = _safe_ts(item.get("display_time", 0))
            news_url = item.get("uri", "") or ""
            results.append({
                "id": _make_id("wallstreet", title[:80]),
                "source": "wallstreet",
                "source_name": "华尔街见闻",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": item.get("is_important", False),
            })
    except Exception as e:
        logger.error(f"华尔街见闻抓取失败: {e}")
    return results
|
||
|
||
|
||
async def fetch_kr36() -> list:
    """Fetch the latest newsflashes from 36Kr.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged
        and never raised; the items collected before the failure (possibly
        none) are returned.
    """
    url = "https://36kr.com/api/newsflash"
    params = {"per_page": "20"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            resp.raise_for_status()
            # "data" / "items" may be missing or null; treat as empty.
            items = (resp.json().get("data") or {}).get("items") or []

        for item in items:
            title = item.get("title", "") or item.get("entity_name", "")
            if not title:
                continue

            ts = _safe_ts(item.get("published_at", ""))
            news_id = item.get("id", "")
            news_url = f"https://36kr.com/newsflashes/{news_id}" if news_id else ""
            results.append({
                "id": _make_id("kr36", title[:80]),
                "source": "kr36",
                "source_name": "36氪",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"36氪抓取失败: {e}")
    return results
|
||
|
||
|
||
async def fetch_sina() -> list:
    """Fetch the latest rolling news from Sina Finance.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged
        and never raised; the items collected before the failure (possibly
        none) are returned.
    """
    url = "https://feed.mix.sina.com.cn/api/roll/get"
    params = {"pageid": "153", "lid": "2516", "k": "", "num": "20", "page": "1"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params)
            resp.raise_for_status()
            # "result" / "data" may be missing or null; treat as empty.
            data = (resp.json().get("result") or {}).get("data") or []

        for item in data:
            title = item.get("title", "")
            if not title:
                continue
            # Strip HTML tags.
            title = re.sub(r"<[^>]+>", "", title).strip()
            if not title:
                continue

            ts = _safe_ts(item.get("ctime", item.get("createtime", 0)))
            news_url = item.get("url", "") or item.get("link", "") or ""
            results.append({
                "id": _make_id("sina", title[:80]),
                "source": "sina",
                "source_name": "新浪财经",
                "title": title[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"新浪财经抓取失败: {e}")
    return results
|
||
|
||
|
||
async def fetch_google_news() -> list:
    """Fetch headlines from a Google News topic RSS feed.

    Per the original author's note this is the technology topic, which
    aggregates outlets such as Reuters and Bloomberg. At most 20 items are
    read and each headline is translated to Chinese with ``translate_to_zh``.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged
        and never raised; the items collected before the failure (possibly
        none) are returned.
    """
    url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB"
    params = {"hl": "en-US", "gl": "US", "ceid": "US:en"}
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, params=params, headers={"User-Agent": "Mozilla/5.0"})
            resp.raise_for_status()
        root = ET.fromstring(resp.text)

        for item in root.findall(".//item")[:20]:
            raw_title = item.findtext("title", "").strip()
            if not raw_title:
                continue

            # Titles end with a " - <publisher>" suffix (e.g. " - Bloomberg.com").
            # Move the publisher into source_name and drop it from the headline.
            headline, sep, source_tag = raw_title.rpartition(" - ")
            if not sep:
                headline, source_tag = raw_title, ""
            headline = headline.strip() or raw_title

            news_url = item.findtext("link", "").strip()
            ts = 0
            pub = item.findtext("pubDate", "")
            if pub:
                try:
                    ts = int(parsedate_to_datetime(pub).timestamp())
                except Exception:
                    # Unparseable date: fall back to the fetch time.
                    ts = int(time.time())

            title_zh = await translate_to_zh(headline)
            results.append({
                # The ID is still derived from the raw feed title so IDs of
                # previously seen items stay stable.
                "id": _make_id("google", raw_title[:80]),
                "source": "google",
                "source_name": f"Google News ({source_tag})" if source_tag else "Google News",
                "title": title_zh[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"Google News 抓取失败: {e}")
    return results
|
||
|
||
|
||
async def fetch_finviz() -> list:
    """Fetch US market news headlines from the Finviz news page.

    Headlines are scraped from the page HTML with regular expressions (at
    most 20) and translated to Chinese with ``translate_to_zh``. No
    publication time is extracted, so every item is stamped with the fetch
    time.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged
        and never raised; the items collected before the failure (possibly
        none) are returned.
    """
    url = "https://finviz.com/news.ashx"
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
            resp.raise_for_status()
        page = resp.text

        # Extract (link, title) pairs from the news anchors.
        matches = re.findall(r'class="nn-tab-link"[^>]*href="([^"]*)"[^>]*>([^<]+)', page)
        if not matches:
            # Fallback: extract titles only, leaving the link empty.
            titles = re.findall(r'class="nn-tab-link"[^>]*>([^<]+)', page)
            matches = [("", t) for t in titles]

        for link, title in matches[:20]:
            title = title.strip()
            if not title:
                continue
            title_zh = await translate_to_zh(title)
            results.append({
                "id": _make_id("finviz", title[:80]),
                "source": "finviz",
                "source_name": "Finviz",
                "title": title_zh[:200],
                "url": link or "",
                "timestamp": int(time.time()),
                "time_str": datetime.now().strftime("%H:%M:%S"),
                "important": False,
            })
    except Exception as e:
        logger.error(f"Finviz 抓取失败: {e}")
    return results
|
||
|
||
|
||
async def fetch_techcrunch() -> list:
    """Fetch technology news from the TechCrunch RSS feed.

    At most 20 items are read and each headline is translated to Chinese
    with ``translate_to_zh``.

    Returns:
        A list of news dicts with the keys ``id``, ``source``,
        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
        ``important`` (always ``False`` for this source). Failures are logged
        and never raised; the items collected before the failure (possibly
        none) are returned.
    """
    url = "https://techcrunch.com/feed/"
    results = []
    try:
        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
            resp = await client.get(url, headers={"User-Agent": "Mozilla/5.0"})
            resp.raise_for_status()
        root = ET.fromstring(resp.text)

        for item in root.findall(".//item")[:20]:
            title = item.findtext("title", "").strip()
            if not title:
                continue

            news_url = item.findtext("link", "").strip()
            ts = 0
            pub = item.findtext("pubDate", "")
            if pub:
                try:
                    ts = int(parsedate_to_datetime(pub).timestamp())
                except Exception:
                    # Unparseable date: fall back to the fetch time.
                    ts = int(time.time())

            title_zh = await translate_to_zh(title)
            results.append({
                "id": _make_id("techcrunch", title[:80]),
                "source": "techcrunch",
                "source_name": "TechCrunch",
                "title": title_zh[:200],
                "url": news_url,
                "timestamp": ts,
                "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
                "important": False,
            })
    except Exception as e:
        logger.error(f"TechCrunch 抓取失败: {e}")
    return results
|
||
|
||
|
||
# Mapping of source key -> coroutine function that fetches that source.
SOURCE_FETCHERS = {
    "jin10": fetch_jin10,
    "wallstreet": fetch_wallstreet,
    "kr36": fetch_kr36,
    "sina": fetch_sina,
    "google": fetch_google_news,
    "finviz": fetch_finviz,
    "techcrunch": fetch_techcrunch,
}
|
||
|
||
# Mapping of source key -> display name of the source.
SOURCE_NAMES = {
    "jin10": "金十数据",
    "wallstreet": "华尔街见闻",
    "kr36": "36氪",
    "sina": "新浪财经",
    "google": "Google News",
    "finviz": "Finviz",
    "techcrunch": "TechCrunch",
}
|
||
|
||
|
||
async def fetch_all(enabled_sources: dict = None) -> list:
    """Fetch every enabled source concurrently and merge the results.

    Args:
        enabled_sources: Optional mapping of source key to an on/off flag.
            Sources absent from the mapping count as enabled; ``None``
            enables every source in ``SOURCE_FETCHERS``.

    Returns:
        The news dicts of all sources in one list, sorted by ``timestamp``
        with the newest first. A source whose fetcher raises is logged and
        skipped; it does not affect the other sources.
    """
    import asyncio

    if enabled_sources is None:
        enabled_sources = {}

    # Keep the source keys alongside the coroutines so a failure can be
    # attributed to its source (gather() returns results in call order).
    names = [name for name in SOURCE_FETCHERS if enabled_sources.get(name, True)]
    results = await asyncio.gather(
        *(SOURCE_FETCHERS[name]() for name in names),
        return_exceptions=True,
    )

    all_news = []
    for name, result in zip(names, results):
        # BaseException rather than Exception: a cancelled fetcher yields
        # CancelledError, which is not an Exception subclass.
        if isinstance(result, BaseException):
            logger.error(f"抓取异常 ({name}): {result}")
            continue
        if isinstance(result, list):
            all_news.extend(result)

    # Sort by time, newest first.
    all_news.sort(key=lambda x: x.get("timestamp", 0), reverse=True)
    return all_news
|