"""News source fetchers.

Supported sources: Jin10, WallStreetCN, 36Kr, Sina Finance, Google News,
Finviz and TechCrunch.

Every ``fetch_*`` coroutine returns a list of dicts with the keys ``id``,
``source``, ``source_name``, ``title``, ``url``, ``timestamp``, ``time_str``
and ``important``. Fetchers never raise: failures are logged and whatever was
collected so far (possibly an empty list) is returned.
"""

import asyncio
import hashlib
import logging
import re
import time
import xml.etree.ElementTree as ET
from datetime import datetime
from email.utils import parsedate_to_datetime
from itertools import islice
from typing import Optional

import httpx

logger = logging.getLogger(__name__)

# Request timeout (seconds) used by every news-source request.
TIMEOUT = 10

# Translation cache (query text -> translated text) to avoid repeated requests.
_translate_cache = {}

# Only this many leading characters of a text are sent for translation.
_TRANSLATE_MAX_CHARS = 500
# Once the cache grows past _CACHE_MAX entries, the oldest _CACHE_EVICT go.
_CACHE_MAX = 500
_CACHE_EVICT = 200

# Numeric timestamps at or above this value are treated as milliseconds.
_MS_THRESHOLD = 2000000000

# Patterns hoisted to module level so they are compiled once.
_TAG_RE = re.compile(r"<[^>]+>")
_NUMERIC_RE = re.compile(r"\d+(\.\d+)?")
_FINVIZ_LINK_RE = re.compile(
    r'class="nn-tab-link"[^>]*href="([^"]*)"[^>]*>([^<]+)'
)
_FINVIZ_TITLE_RE = re.compile(r'class="nn-tab-link"[^>]*>([^<]+)')

# Errors that date parsing / epoch conversion can raise on malformed input.
_DATE_ERRORS = (TypeError, ValueError, OverflowError, OSError)


async def translate_to_zh(text: str) -> str:
    """Translate English text to Simplified Chinese via Google's free endpoint.

    Text that is empty, or in which more than 30% of the characters are CJK
    ideographs, is returned unchanged. Only the first 500 characters are sent.
    On any failure (network, HTTP status, unexpected payload, empty result)
    the original text is returned.
    """
    if not text:
        return text

    # Skip text that is already mostly Chinese.
    zh_count = sum(1 for c in text if "\u4e00" <= c <= "\u9fff")
    if zh_count > len(text) * 0.3:
        return text

    # Key the cache on exactly what is sent. Keying on a shorter prefix would
    # hand one text's translation to another text sharing that prefix.
    query = text[:_TRANSLATE_MAX_CHARS]
    cached = _translate_cache.get(query)
    if cached is not None:
        return cached

    try:
        async with httpx.AsyncClient(timeout=8) as client:
            resp = await client.get(
                "https://translate.googleapis.com/translate_a/single",
                params={
                    "client": "gtx",
                    "sl": "en",
                    "tl": "zh-CN",
                    "dt": "t",
                    "q": query,
                },
            )
        resp.raise_for_status()
        result = resp.json()
        translated = "".join(seg[0] for seg in result[0] if seg[0])
    except Exception as e:  # best-effort: never let translation break a fetch
        logger.error("翻译失败: %s", e)
        return text

    if not translated:
        # An empty translation would blank the title; keep the original.
        return text

    _translate_cache[query] = translated
    # Bound the cache: dicts preserve insertion order, so the first keys are
    # the oldest entries.
    if len(_translate_cache) > _CACHE_MAX:
        for stale in list(islice(_translate_cache, _CACHE_EVICT)):
            del _translate_cache[stale]
    return translated


def _make_id(source: str, title: str) -> str:
    """Return a 16-hex-char ID derived from the source name and title.

    MD5 is used purely as a fingerprint here, not for security.
    """
    raw = f"{source}:{title}"
    return hashlib.md5(raw.encode()).hexdigest()[:16]


def _epoch_seconds(value) -> int:
    """Convert a numeric epoch value to whole seconds (milliseconds accepted)."""
    return int(value) if value < _MS_THRESHOLD else int(value / 1000)


def _safe_ts(val, default=0) -> int:
    """Convert a timestamp-like value to epoch seconds without raising.

    Accepted inputs:
      * int/float epoch values, in seconds or milliseconds;
      * numeric strings holding such a value (e.g. ``"1700000000"``);
      * ``YYYY-MM-DD HH:MM:SS`` / ``YYYY-MM-DDTHH:MM:SS`` strings (anything
        after the seconds, such as fractions, is ignored) and bare
        ``YYYY-MM-DD`` dates, all interpreted as naive local time.

    Returns ``default`` if it is truthy, otherwise the current time, when the
    value cannot be interpreted.
    """
    try:
        if isinstance(val, (int, float)):
            return _epoch_seconds(val)
        if isinstance(val, str):
            text = val.strip()
            if _NUMERIC_RE.fullmatch(text):
                return _epoch_seconds(float(text))
            candidates = [
                (text[:19], "%Y-%m-%d %H:%M:%S"),
                (text[:19], "%Y-%m-%dT%H:%M:%S"),
            ]
            if len(text) == 10:
                candidates.append((text, "%Y-%m-%d"))
            for chunk, fmt in candidates:
                try:
                    return int(datetime.strptime(chunk, fmt).timestamp())
                except ValueError:
                    continue
    except _DATE_ERRORS:
        # NaN/inf or out-of-range values: fall through to the default.
        pass
    return default or int(time.time())


def _strip_html(text) -> str:
    """Remove HTML tags and surrounding whitespace; non-strings become ''."""
    if not text or not isinstance(text, str):
        return ""
    return _TAG_RE.sub("", text).strip()


def _fmt_time(ts: int) -> str:
    """Format epoch seconds as local ``HH:MM:SS``; '' for 0 or invalid values."""
    if not ts:
        return ""
    try:
        return datetime.fromtimestamp(ts).strftime("%H:%M:%S")
    except _DATE_ERRORS:
        return ""


def _news_item(source, source_name, id_text, title, url, ts, time_str,
               important) -> dict:
    """Build one news record in the shape shared by every fetcher."""
    return {
        "id": _make_id(source, id_text[:80]),
        "source": source,
        "source_name": source_name,
        "title": title[:200],
        "url": url,
        "timestamp": ts,
        "time_str": time_str,
        "important": important,
    }


async def _http_get(url: str, *, params=None, headers=None) -> httpx.Response:
    """GET ``url`` with the module timeout; raise on a non-success status."""
    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
        resp = await client.get(url, params=params, headers=headers)
    resp.raise_for_status()
    return resp


def _parse_rss(xml_text: str, limit: int = 20) -> list:
    """Parse an RSS document into ``(title, link, timestamp)`` tuples.

    Only the first ``limit`` ``<item>`` elements are considered; items with an
    empty title are skipped. ``timestamp`` is 0 when ``pubDate`` is absent and
    the current time when it is present but unparseable.

    NOTE(security): ``xml.etree.ElementTree`` does not protect against
    maliciously constructed XML (e.g. entity-expansion bombs). The feeds read
    here are remote input; consider a hardened parser if that is a concern.
    """
    root = ET.fromstring(xml_text)
    entries = []
    for node in root.findall(".//item")[:limit]:
        title = (node.findtext("title", "") or "").strip()
        if not title:
            continue
        link = (node.findtext("link", "") or "").strip()
        ts = 0
        pub = node.findtext("pubDate", "")
        if pub:
            try:
                ts = int(parsedate_to_datetime(pub).timestamp())
            except _DATE_ERRORS:
                ts = int(time.time())
        entries.append((title, link, ts))
    return entries


async def fetch_jin10() -> list:
    """Fetch flash news from Jin10."""
    url = "https://flash-api.jin10.com/get_flash_list"
    params = {"channel": "-8200", "vip": "1", "max_time": "", "t": "1"}
    headers = {"x-app-id": "bVBF4FyRTn5NJF5n", "x-version": "1.0.0"}
    results = []
    try:
        resp = await _http_get(url, params=params, headers=headers)
        for item in resp.json().get("data") or []:
            if not isinstance(item, dict):
                continue
            # The item's "data" field may be a dict or the text itself.
            payload = item.get("data")
            content = ""
            if isinstance(payload, dict):
                content = payload.get("content", "") or payload.get("title", "")
            elif isinstance(payload, str):
                content = payload
            # Fallback: top-level "content" field.
            if not content:
                content = item.get("content", "")
            content = _strip_html(content)
            if not content:
                continue
            raw_time = item.get("time", "")
            news_id = item.get("id", "")
            news_url = (
                f"https://www.jin10.com/flash_detail/{news_id}.html"
                if news_id else ""
            )
            results.append(_news_item(
                "jin10", "金十数据", content, content, news_url,
                _safe_ts(raw_time), raw_time,
                item.get("important", 0) == 1,
            ))
    except Exception as e:
        logger.error("金十数据抓取失败: %s", e)
    return results


async def fetch_wallstreet() -> list:
    """Fetch live news from WallStreetCN."""
    url = "https://api-one.wallstcn.com/apiv1/content/lives"
    params = {"channel": "global-channel", "limit": "20"}
    results = []
    try:
        resp = await _http_get(url, params=params)
        data = resp.json().get("data") or {}
        for item in data.get("items") or []:
            if not isinstance(item, dict):
                continue
            title = _strip_html(
                item.get("content_text", "") or item.get("title", "")
            )
            if not title:
                continue
            ts = _safe_ts(item.get("display_time", 0))
            results.append(_news_item(
                "wallstreet", "华尔街见闻", title, title,
                item.get("uri", "") or "", ts, _fmt_time(ts),
                item.get("is_important", False),
            ))
    except Exception as e:
        logger.error("华尔街见闻抓取失败: %s", e)
    return results


async def fetch_kr36() -> list:
    """Fetch newsflashes from 36Kr."""
    url = "https://36kr.com/api/newsflash"
    params = {"per_page": "20"}
    results = []
    try:
        resp = await _http_get(url, params=params)
        data = resp.json().get("data") or {}
        for item in data.get("items") or []:
            if not isinstance(item, dict):
                continue
            title = item.get("title", "") or item.get("entity_name", "")
            if not title:
                continue
            ts = _safe_ts(item.get("published_at", ""))
            news_id = item.get("id", "")
            news_url = (
                f"https://36kr.com/newsflashes/{news_id}" if news_id else ""
            )
            results.append(_news_item(
                "kr36", "36氪", title, title, news_url, ts, _fmt_time(ts),
                False,
            ))
    except Exception as e:
        logger.error("36氪抓取失败: %s", e)
    return results


async def fetch_sina() -> list:
    """Fetch rolling finance news from Sina Finance."""
    url = "https://feed.mix.sina.com.cn/api/roll/get"
    params = {"pageid": "153", "lid": "2516", "k": "", "num": "20", "page": "1"}
    results = []
    try:
        resp = await _http_get(url, params=params)
        result = resp.json().get("result") or {}
        for item in result.get("data") or []:
            if not isinstance(item, dict):
                continue
            title = _strip_html(item.get("title", ""))
            if not title:
                continue
            ts = _safe_ts(item.get("ctime", item.get("createtime", 0)))
            news_url = item.get("url", "") or item.get("link", "") or ""
            results.append(_news_item(
                "sina", "新浪财经", title, title, news_url, ts, _fmt_time(ts),
                False,
            ))
    except Exception as e:
        logger.error("新浪财经抓取失败: %s", e)
    return results


async def fetch_google_news() -> list:
    """Fetch the Google News technology topic RSS (aggregates major outlets)."""
    url = (
        "https://news.google.com/rss/topics/"
        "CAAqJggKIiBDQkFTRWdvSUwyMHZNRGRqTVhZU0FtVnVHZ0pWVXlnQVAB"
    )
    params = {"hl": "en-US", "gl": "US", "ceid": "US:en"}
    results = []
    try:
        resp = await _http_get(
            url, params=params, headers={"User-Agent": "Mozilla/5.0"}
        )
        for title, news_url, ts in _parse_rss(resp.text):
            # Split off a trailing " - Publisher" suffix: the publisher goes
            # into source_name and the headline is shown without it.
            headline, source_tag = title, ""
            if " - " in title:
                headline, source_tag = title.rsplit(" - ", 1)
                headline = headline.strip() or title
            title_zh = await translate_to_zh(headline)
            source_name = (
                f"Google News ({source_tag})" if source_tag else "Google News"
            )
            # The ID is derived from the full feed title (suffix included) so
            # IDs stay stable for already-seen items.
            results.append(_news_item(
                "google", source_name, title, title_zh, news_url, ts,
                _fmt_time(ts), False,
            ))
    except Exception as e:
        logger.error("Google News 抓取失败: %s", e)
    return results


async def fetch_finviz() -> list:
    """Fetch US market news headlines from Finviz.

    The page carries no per-item time that this scraper extracts, so every
    item is stamped with the time at which it was processed.
    """
    url = "https://finviz.com/news.ashx"
    results = []
    try:
        resp = await _http_get(url, headers={"User-Agent": "Mozilla/5.0"})
        # Extract (link, title) pairs from the HTML.
        matches = _FINVIZ_LINK_RE.findall(resp.text)
        if not matches:
            # Fallback: titles only, without links.
            matches = [("", t) for t in _FINVIZ_TITLE_RE.findall(resp.text)]
        for link, title in matches[:20]:
            title = title.strip()
            if not title:
                continue
            title_zh = await translate_to_zh(title)
            results.append(_news_item(
                "finviz", "Finviz", title, title_zh, link or "",
                int(time.time()), datetime.now().strftime("%H:%M:%S"), False,
            ))
    except Exception as e:
        logger.error("Finviz 抓取失败: %s", e)
    return results


async def fetch_techcrunch() -> list:
    """Fetch technology news from the TechCrunch RSS feed."""
    url = "https://techcrunch.com/feed/"
    results = []
    try:
        resp = await _http_get(url, headers={"User-Agent": "Mozilla/5.0"})
        for title, news_url, ts in _parse_rss(resp.text):
            title_zh = await translate_to_zh(title)
            results.append(_news_item(
                "techcrunch", "TechCrunch", title, title_zh, news_url, ts,
                _fmt_time(ts), False,
            ))
    except Exception as e:
        logger.error("TechCrunch 抓取失败: %s", e)
    return results


# Source key -> fetcher coroutine function.
SOURCE_FETCHERS = {
    "jin10": fetch_jin10,
    "wallstreet": fetch_wallstreet,
    "kr36": fetch_kr36,
    "sina": fetch_sina,
    "google": fetch_google_news,
    "finviz": fetch_finviz,
    "techcrunch": fetch_techcrunch,
}

# Source key -> display name.
SOURCE_NAMES = {
    "jin10": "金十数据",
    "wallstreet": "华尔街见闻",
    "kr36": "36氪",
    "sina": "新浪财经",
    "google": "Google News",
    "finviz": "Finviz",
    "techcrunch": "TechCrunch",
}


async def fetch_all(enabled_sources: Optional[dict] = None) -> list:
    """Fetch every enabled source concurrently and merge the results.

    Args:
        enabled_sources: Optional mapping of source key -> bool. Sources
            missing from the mapping count as enabled; ``None`` enables all.

    Returns:
        All news items from the enabled sources, newest first.
    """
    if enabled_sources is None:
        enabled_sources = {}

    tasks = [
        fetcher()
        for name, fetcher in SOURCE_FETCHERS.items()
        if enabled_sources.get(name, True)
    ]

    all_news = []
    outcomes = await asyncio.gather(*tasks, return_exceptions=True)
    for outcome in outcomes:
        # gather() may also hand back BaseException subclasses (for example
        # CancelledError); log those too instead of dropping them silently.
        if isinstance(outcome, BaseException):
            logger.error("抓取异常: %s", outcome)
            continue
        if isinstance(outcome, list):
            all_news.extend(outcome)

    # Sort by timestamp, newest first.
    all_news.sort(key=lambda x: x.get("timestamp", 0), reverse=True)
    return all_news