Rename to hkt.sh

2026-03-21 01:10:53 +08:00
parent 76a263d0f9
commit 8f1171fe99
6676 changed files with 1724268 additions and 0 deletions
--- a/projects/news-bot/sources.py
+++ b/projects/news-bot/sources.py
@@ -0,0 +1,374 @@
+"""
+信息源抓取模块 - 支持金十、华尔街见闻、36氪、新浪财经、Google News、Finviz、TechCrunch
+"""
+import re
+import time
+import hashlib
+import logging
+import httpx
+import xml.etree.ElementTree as ET
+from datetime import datetime
+from email.utils import parsedate_to_datetime
+
+logger = logging.getLogger(__name__)
+
+# Request timeout passed to the httpx clients created by the fetchers
+TIMEOUT = 10
+
+# Translation cache, so the same text is not translated repeatedly
+_translate_cache = {}
+
+
+async def translate_to_zh(text: str) -> str:
+    """Translate English text to Simplified Chinese via the free Google Translate endpoint.
+
+    Args:
+        text: Text to translate. Only the first 500 characters are sent.
+
+    Returns:
+        The translated text. ``text`` is returned unchanged when it is empty,
+        when more than 30% of its characters are already Chinese, or when the
+        request fails or yields nothing usable (translation is best-effort).
+    """
+    if not text:
+        return text
+    # Skip texts that are already predominantly Chinese.
+    zh_count = sum(1 for c in text if '\u4e00' <= c <= '\u9fff')
+    if zh_count > len(text) * 0.3:
+        return text
+    # Key the cache on exactly what is sent to the API. Keying on a shorter
+    # prefix would hand one headline's translation to any other headline that
+    # merely starts with the same characters.
+    query = text[:500]
+    if query in _translate_cache:
+        return _translate_cache[query]
+    try:
+        async with httpx.AsyncClient(timeout=8) as client:
+            resp = await client.get(
+                "https://translate.googleapis.com/translate_a/single",
+                params={"client": "gtx", "sl": "en", "tl": "zh-CN", "dt": "t", "q": query},
+            )
+            # Surface HTTP errors (rate limiting etc.) with a clear status
+            # instead of an opaque JSON decoding failure.
+            resp.raise_for_status()
+            result = resp.json()
+        translated = "".join(seg[0] for seg in result[0] if seg[0])
+        if not translated:
+            # Never cache or return an empty translation; keep the original.
+            return text
+        _translate_cache[query] = translated
+        # Bound the cache: once it exceeds 500 entries drop the 200 oldest
+        # (dicts preserve insertion order).
+        if len(_translate_cache) > 500:
+            for stale in list(_translate_cache)[:200]:
+                del _translate_cache[stale]
+        return translated
+    except Exception as e:
+        # Best-effort boundary: any failure falls back to the original text.
+        logger.error(f"翻译失败: {e}")
+        return text
+
+
+def _make_id(source: str, title: str) -> str:
+    """Build a short deterministic id for a news item.
+
+    The id is the first 16 hex characters of the MD5 digest of
+    ``"<source>:<title>"``.
+    """
+    payload = "{}:{}".format(source, title).encode()
+    digest = hashlib.md5(payload)
+    return digest.hexdigest()[:16]
+
+
+def _safe_ts(val, default=0) -> int:
+    """Convert a loosely-typed timestamp value to Unix seconds.
+
+    Accepted inputs:
+      * ``int`` / ``float`` epoch values; values of 2_000_000_000 or more are
+        treated as milliseconds and divided by 1000.
+      * Numeric strings (e.g. ``"1700000000"``), handled like numbers.
+      * ISO-8601-like date strings such as ``"2024-01-02 03:04:05"``,
+        ``"2024-01-02T03:04:05.123456"``, ``"2024-01-02T03:04:05+08:00"`` or
+        with a trailing ``Z``. Strings without an offset are interpreted in
+        the local time zone.
+
+    Args:
+        val: The raw timestamp value.
+        default: Value returned when ``val`` cannot be interpreted. A falsy
+            default (the default ``0``) means "use the current time".
+
+    Returns:
+        Unix timestamp in whole seconds.
+    """
+    def _from_number(num):
+        # Millisecond timestamps are far above any plausible second value.
+        return int(num) if num < 2000000000 else int(num / 1000)
+
+    try:
+        if isinstance(val, (int, float)):
+            return _from_number(val)
+        if isinstance(val, str):
+            text = val.strip()
+            if text:
+                # Several feeds deliver epoch values as strings.
+                try:
+                    return _from_number(float(text))
+                except (ValueError, OverflowError):
+                    pass
+                # fromisoformat() only accepts a trailing "Z" on newer Pythons.
+                iso = text[:-1] + "+00:00" if text[-1] in "Zz" else text
+                try:
+                    return int(datetime.fromisoformat(iso).timestamp())
+                except ValueError:
+                    pass
+                # Last resort: parse the leading "YYYY-MM-DD?HH:MM:SS" part
+                # and ignore whatever follows it.
+                for fmt in ("%Y-%m-%d %H:%M:%S", "%Y-%m-%dT%H:%M:%S"):
+                    try:
+                        return int(datetime.strptime(text[:19], fmt).timestamp())
+                    except ValueError:
+                        continue
+    except (ValueError, OverflowError, OSError, TypeError):
+        # NaN/inf numbers or dates outside the platform's range: fall through.
+        pass
+    return default or int(time.time())
+
+
+async def fetch_jin10() -> list:
+    """Fetch the latest flash news from Jin10.
+
+    Returns:
+        A list of news dicts with the keys ``id``, ``source``,
+        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
+        ``important``. On any request or parsing error the error is logged
+        and whatever was collected so far (possibly nothing) is returned.
+    """
+    url = "https://flash-api.jin10.com/get_flash_list"
+    params = {"channel": "-8200", "vip": "1", "max_time": "", "t": "1"}
+    headers = {"x-app-id": "bVBF4FyRTn5NJF5n", "x-version": "1.0.0"}
+    results = []
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+            resp = await client.get(url, params=params, headers=headers)
+            # Report HTTP failures with their status instead of as a
+            # confusing JSON/attribute error further down.
+            resp.raise_for_status()
+            # "data" may be absent or null; both mean "no items".
+            data = resp.json().get("data") or []
+            for item in data:
+                if not isinstance(item, dict):
+                    # One malformed entry should not discard the whole batch.
+                    continue
+                # The item's "data" field may be a dict or plain text.
+                payload = item.get("data")
+                content = ""
+                if isinstance(payload, dict):
+                    content = payload.get("content", "") or payload.get("title", "")
+                elif isinstance(payload, str):
+                    content = payload
+                # Fallback: use the top-level "content" field.
+                if not content:
+                    content = item.get("content", "")
+                if not content:
+                    continue
+                # Strip HTML tags.
+                content = re.sub(r"<[^>]+>", "", content).strip()
+                if not content:
+                    continue
+                ts = _safe_ts(item.get("time", ""))
+                news_id = item.get("id", "")
+                news_url = f"https://www.jin10.com/flash_detail/{news_id}.html" if news_id else ""
+                results.append({
+                    "id": _make_id("jin10", content[:80]),
+                    "source": "jin10",
+                    "source_name": "金十数据",
+                    "title": content[:200],
+                    "url": news_url,
+                    "timestamp": ts,
+                    "time_str": item.get("time", ""),
+                    "important": item.get("important", 0) == 1,
+                })
+    except Exception as e:
+        # Best-effort boundary: a failing source must not break the others.
+        logger.error(f"金十数据抓取失败: {e}")
+    return results
+
+
+async def fetch_wallstreet() -> list:
+    """Fetch the latest live news from Wallstreetcn.
+
+    Returns:
+        A list of news dicts with the keys ``id``, ``source``,
+        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
+        ``important``. On any request or parsing error the error is logged
+        and whatever was collected so far (possibly nothing) is returned.
+    """
+    url = "https://api-one.wallstcn.com/apiv1/content/lives"
+    params = {"channel": "global-channel", "limit": "20"}
+    results = []
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+            resp = await client.get(url, params=params)
+            # Report HTTP failures with their status instead of as a
+            # confusing JSON/attribute error further down.
+            resp.raise_for_status()
+            # Either level may be absent or null; treat that as "no items".
+            items = (resp.json().get("data") or {}).get("items") or []
+            for item in items:
+                title = item.get("content_text", "") or item.get("title", "")
+                if not title:
+                    continue
+                # Strip HTML tags.
+                title = re.sub(r"<[^>]+>", "", title).strip()
+                if not title:
+                    continue
+                ts = _safe_ts(item.get("display_time", 0))
+                news_url = item.get("uri", "") or ""
+                results.append({
+                    "id": _make_id("wallstreet", title[:80]),
+                    "source": "wallstreet",
+                    "source_name": "华尔街见闻",
+                    "title": title[:200],
+                    "url": news_url,
+                    "timestamp": ts,
+                    "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
+                    "important": item.get("is_important", False),
+                })
+    except Exception as e:
+        # Best-effort boundary: a failing source must not break the others.
+        logger.error(f"华尔街见闻抓取失败: {e}")
+    return results
+
+
+async def fetch_kr36() -> list:
+    """Fetch the latest newsflashes from 36Kr.
+
+    Returns a list of news dicts (``id``, ``source``, ``source_name``,
+    ``title``, ``url``, ``timestamp``, ``time_str``, ``important``). Errors
+    are logged and the items gathered so far are returned.
+    """
+    collected = []
+    endpoint = "https://36kr.com/api/newsflash"
+    query = {"per_page": "20"}
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as http:
+            response = await http.get(endpoint, params=query)
+            payload = response.json()
+            for entry in payload.get("data", {}).get("items", []):
+                headline = entry.get("title", "") or entry.get("entity_name", "")
+                if not headline:
+                    continue
+                stamp = _safe_ts(entry.get("published_at", ""))
+                flash_id = entry.get("id", "")
+                link = ""
+                if flash_id:
+                    link = f"https://36kr.com/newsflashes/{flash_id}"
+                record = {
+                    "id": _make_id("kr36", headline[:80]),
+                    "source": "kr36",
+                    "source_name": "36氪",
+                    "title": headline[:200],
+                    "url": link,
+                    "timestamp": stamp,
+                    "time_str": "",
+                    "important": False,
+                }
+                # Only a non-zero timestamp gets a clock string.
+                if stamp:
+                    record["time_str"] = datetime.fromtimestamp(stamp).strftime("%H:%M:%S")
+                collected.append(record)
+    except Exception as err:
+        logger.error(f"36氪抓取失败: {err}")
+    return collected
+
+
+async def fetch_sina() -> list:
+    """Fetch the latest rolling news from Sina Finance.
+
+    Returns:
+        A list of news dicts with the keys ``id``, ``source``,
+        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
+        ``important``. On any request or parsing error the error is logged
+        and whatever was collected so far (possibly nothing) is returned.
+    """
+    url = "https://feed.mix.sina.com.cn/api/roll/get"
+    params = {"pageid": "153", "lid": "2516", "k": "", "num": "20", "page": "1"}
+    results = []
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+            resp = await client.get(url, params=params)
+            # Report HTTP failures with their status instead of as a
+            # confusing JSON/attribute error further down.
+            resp.raise_for_status()
+            # Either level may be absent or null; treat that as "no items".
+            data = (resp.json().get("result") or {}).get("data") or []
+            for item in data:
+                title = item.get("title", "")
+                if not title:
+                    continue
+                # Strip HTML tags.
+                title = re.sub(r"<[^>]+>", "", title).strip()
+                if not title:
+                    continue
+                # Prefer "ctime"; an empty/missing value falls back to
+                # "createtime" rather than masking it.
+                ts = _safe_ts(item.get("ctime") or item.get("createtime") or 0)
+                news_url = item.get("url", "") or item.get("link", "") or ""
+                results.append({
+                    "id": _make_id("sina", title[:80]),
+                    "source": "sina",
+                    "source_name": "新浪财经",
+                    "title": title[:200],
+                    "url": news_url,
+                    "timestamp": ts,
+                    "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
+                    "important": False,
+                })
+    except Exception as e:
+        # Best-effort boundary: a failing source must not break the others.
+        logger.error(f"新浪财经抓取失败: {e}")
+    return results
+
+
+async def fetch_google_news() -> list:
+    """Fetch a Google News topic RSS feed and translate the headlines.
+
+    The publisher suffix at the end of a headline (e.g. " - Bloomberg.com")
+    is moved into ``source_name`` and removed from the displayed title. The
+    item id is still derived from the full original headline, so ids stay
+    stable.
+
+    Returns:
+        A list of at most 20 news dicts with the keys ``id``, ``source``,
+        ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str`` and
+        ``important``. On any request or parsing error the error is logged
+        and whatever was collected so far (possibly nothing) is returned.
+    """
+    url = "https://news.google.com/rss/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB"
+    params = {"hl": "en-US", "gl": "US", "ceid": "US:en"}
+    results = []
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+            resp = await client.get(url, params=params, headers={"User-Agent": "Mozilla/5.0"})
+            # Fail with the HTTP status rather than an XML parse error on an
+            # error page.
+            resp.raise_for_status()
+            root = ET.fromstring(resp.text)
+            for item in root.findall(".//item")[:20]:
+                title = item.findtext("title", "").strip()
+                if not title:
+                    continue
+                # Split off the trailing source suffix (" - Bloomberg.com" etc.).
+                headline, sep, source_tag = title.rpartition(" - ")
+                if sep:
+                    headline = headline.strip() or title
+                else:
+                    headline, source_tag = title, ""
+                news_url = item.findtext("link", "").strip()
+                ts = 0
+                pub = item.findtext("pubDate", "")
+                if pub:
+                    try:
+                        ts = int(parsedate_to_datetime(pub).timestamp())
+                    except Exception:
+                        # Unparseable date: fall back to the fetch time.
+                        ts = int(time.time())
+                title_zh = await translate_to_zh(headline)
+                results.append({
+                    # Hash the full original headline so ids do not change.
+                    "id": _make_id("google", title[:80]),
+                    "source": "google",
+                    "source_name": f"Google News ({source_tag})" if source_tag else "Google News",
+                    "title": title_zh[:200],
+                    "url": news_url,
+                    "timestamp": ts,
+                    "time_str": datetime.fromtimestamp(ts).strftime("%H:%M:%S") if ts else "",
+                    "important": False,
+                })
+    except Exception as e:
+        # Best-effort boundary: a failing source must not break the others.
+        logger.error(f"Google News 抓取失败: {e}")
+    return results
+
+
+async def fetch_finviz() -> list:
+    """Scrape headlines from the Finviz news page and translate them.
+
+    Returns a list of at most 20 news dicts (``id``, ``source``,
+    ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str``,
+    ``important``). The page is matched with regular expressions and each
+    item is stamped with the time it was processed. Errors are logged and
+    the items gathered so far are returned.
+    """
+    collected = []
+    page_url = "https://finviz.com/news.ashx"
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as http:
+            response = await http.get(page_url, headers={"User-Agent": "Mozilla/5.0"})
+            html = response.text
+            # Preferred pattern: capture both the link and the headline text.
+            pairs = re.findall(r'class="nn-tab-link"[^>]*href="([^"]*)"[^>]*>([^<]+)', html)
+            if not pairs:
+                # Fallback: capture headlines only, with an empty link.
+                pairs = [
+                    ("", text)
+                    for text in re.findall(r'class="nn-tab-link"[^>]*>([^<]+)', html)
+                ]
+            for href, raw_title in pairs[:20]:
+                headline = raw_title.strip()
+                if not headline:
+                    continue
+                translated = await translate_to_zh(headline)
+                record = {
+                    "id": _make_id("finviz", headline[:80]),
+                    "source": "finviz",
+                    "source_name": "Finviz",
+                    "title": translated[:200],
+                    "url": href or "",
+                    "timestamp": int(time.time()),
+                    "time_str": datetime.now().strftime("%H:%M:%S"),
+                    "important": False,
+                }
+                collected.append(record)
+    except Exception as err:
+        logger.error(f"Finviz 抓取失败: {err}")
+    return collected
+
+
+async def fetch_techcrunch() -> list:
+    """Fetch the TechCrunch RSS feed and translate the headlines.
+
+    Returns a list of at most 20 news dicts (``id``, ``source``,
+    ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str``,
+    ``important``). Errors are logged and the items gathered so far are
+    returned.
+    """
+    collected = []
+    feed_url = "https://techcrunch.com/feed/"
+    try:
+        async with httpx.AsyncClient(timeout=TIMEOUT) as http:
+            response = await http.get(feed_url, headers={"User-Agent": "Mozilla/5.0"})
+            feed = ET.fromstring(response.text)
+            entries = feed.findall(".//item")[:20]
+            for entry in entries:
+                headline = entry.findtext("title", "").strip()
+                if not headline:
+                    continue
+                link = entry.findtext("link", "").strip()
+                published = entry.findtext("pubDate", "")
+                stamp = 0
+                if published:
+                    try:
+                        stamp = int(parsedate_to_datetime(published).timestamp())
+                    except Exception:
+                        # Unparseable date: fall back to the fetch time.
+                        stamp = int(time.time())
+                translated = await translate_to_zh(headline)
+                record = {
+                    "id": _make_id("techcrunch", headline[:80]),
+                    "source": "techcrunch",
+                    "source_name": "TechCrunch",
+                    "title": translated[:200],
+                    "url": link,
+                    "timestamp": stamp,
+                    "time_str": "",
+                    "important": False,
+                }
+                # Only a non-zero timestamp gets a clock string.
+                if stamp:
+                    record["time_str"] = datetime.fromtimestamp(stamp).strftime("%H:%M:%S")
+                collected.append(record)
+    except Exception as err:
+        logger.error(f"TechCrunch 抓取失败: {err}")
+    return collected
+
+
+# Source key -> fetcher coroutine function
+SOURCE_FETCHERS = {
+    "jin10": fetch_jin10,
+    "wallstreet": fetch_wallstreet,
+    "kr36": fetch_kr36,
+    "sina": fetch_sina,
+    "google": fetch_google_news,
+    "finviz": fetch_finviz,
+    "techcrunch": fetch_techcrunch,
+}
+
+# Source key -> display name (same keys as SOURCE_FETCHERS)
+SOURCE_NAMES = {
+    "jin10": "金十数据",
+    "wallstreet": "华尔街见闻",
+    "kr36": "36氪",
+    "sina": "新浪财经",
+    "google": "Google News",
+    "finviz": "Finviz",
+    "techcrunch": "TechCrunch",
+}
+
+
+async def fetch_all(enabled_sources: dict = None) -> list:
+    """Fetch every enabled source concurrently and return one merged list.
+
+    Args:
+        enabled_sources: Mapping of source key to an enabled flag. Sources
+            missing from the mapping count as enabled; ``None`` enables all
+            sources in ``SOURCE_FETCHERS``.
+
+    Returns:
+        All fetched news dicts, sorted by ``timestamp`` with the newest first.
+    """
+    import asyncio
+
+    if enabled_sources is None:
+        enabled_sources = dict.fromkeys(SOURCE_FETCHERS, True)
+
+    pending = [
+        fetch()
+        for key, fetch in SOURCE_FETCHERS.items()
+        if enabled_sources.get(key, True)
+    ]
+
+    merged = []
+    outcomes = await asyncio.gather(*pending, return_exceptions=True)
+    for outcome in outcomes:
+        if isinstance(outcome, Exception):
+            logger.error(f"抓取异常: {outcome}")
+        elif isinstance(outcome, list):
+            merged.extend(outcome)
+
+    # Newest first.
+    return sorted(merged, key=lambda news: news.get("timestamp", 0), reverse=True)