Mirror of https://github.com/NanmiCoder/MediaCrawler.git, synced 2026-05-09 03:57:43 +08:00
i18n: translate all Chinese comments, docstrings, and logger messages to English
Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: the Chinese disclaimer header (lines 10-18) for legal compliance.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -26,10 +26,10 @@ router = APIRouter(prefix="/crawler", tags=["crawler"])

 @router.post("/start")
 async def start_crawler(request: CrawlerStartRequest):
-    """启动爬虫任务"""
+    """Start crawler task"""
     success = await crawler_manager.start(request)
     if not success:
-        # 处理并发/重复请求:如果进程已经在跑,返回 400 而不是 500
+        # Handle concurrent/duplicate requests: if process is already running, return 400 instead of 500
         if crawler_manager.process and crawler_manager.process.poll() is None:
             raise HTTPException(status_code=400, detail="Crawler is already running")
         raise HTTPException(status_code=500, detail="Failed to start crawler")
@@ -39,10 +39,10 @@ async def start_crawler(request: CrawlerStartRequest):

 @router.post("/stop")
 async def stop_crawler():
-    """停止爬虫任务"""
+    """Stop crawler task"""
     success = await crawler_manager.stop()
     if not success:
-        # 处理并发/重复请求:如果进程已退出/不存在,返回 400 而不是 500
+        # Handle concurrent/duplicate requests: if process already exited/doesn't exist, return 400 instead of 500
         if not crawler_manager.process or crawler_manager.process.poll() is not None:
             raise HTTPException(status_code=400, detail="No crawler is running")
         raise HTTPException(status_code=500, detail="Failed to stop crawler")
@@ -52,12 +52,12 @@ async def stop_crawler():

 @router.get("/status", response_model=CrawlerStatusResponse)
 async def get_crawler_status():
-    """获取爬虫状态"""
+    """Get crawler status"""
     return crawler_manager.get_status()


 @router.get("/logs")
 async def get_logs(limit: int = 100):
-    """获取最近的日志"""
+    """Get recent logs"""
     logs = crawler_manager.logs[-limit:] if limit > 0 else crawler_manager.logs
     return {"logs": [log.model_dump() for log in logs]}
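A minimal client sketch for the three endpoints above, assuming the API serves on http://127.0.0.1:8000 under the /crawler prefix; the start-request body fields are hypothetical placeholders, so check CrawlerStartRequest for the real schema:

```python
import requests

BASE = "http://127.0.0.1:8000"  # assumed host/port; adjust to your deployment

# Body fields are illustrative only -- CrawlerStartRequest defines the real ones
resp = requests.post(f"{BASE}/crawler/start", json={"platform": "xhs", "keywords": "coffee"})
if resp.status_code == 400:
    # Duplicate start: the process is already running (see the hunk above)
    print("already running:", resp.json()["detail"])

print(requests.get(f"{BASE}/crawler/status").json())
print(requests.get(f"{BASE}/crawler/logs", params={"limit": 10}).json())
requests.post(f"{BASE}/crawler/stop")
```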
@@ -26,16 +26,16 @@ from fastapi.responses import FileResponse

 router = APIRouter(prefix="/data", tags=["data"])

-# 数据目录
+# Data directory
 DATA_DIR = Path(__file__).parent.parent.parent / "data"


 def get_file_info(file_path: Path) -> dict:
-    """获取文件信息"""
+    """Get file information"""
     stat = file_path.stat()
     record_count = None

-    # 尝试获取记录数
+    # Try to get record count
     try:
         if file_path.suffix == ".json":
             with open(file_path, "r", encoding="utf-8") as f:
@@ -44,7 +44,7 @@ def get_file_info(file_path: Path) -> dict:
                 record_count = len(data)
         elif file_path.suffix == ".csv":
             with open(file_path, "r", encoding="utf-8") as f:
-                record_count = sum(1 for _ in f) - 1  # 减去标题行
+                record_count = sum(1 for _ in f) - 1  # Subtract header row
     except Exception:
         pass

@@ -60,7 +60,7 @@ def get_file_info(file_path: Path) -> dict:

 @router.get("/files")
 async def list_data_files(platform: Optional[str] = None, file_type: Optional[str] = None):
-    """获取数据文件列表"""
+    """Get data file list"""
     if not DATA_DIR.exists():
         return {"files": []}

@@ -74,13 +74,13 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
         if file_path.suffix.lower() not in supported_extensions:
             continue

-        # 平台过滤
+        # Platform filter
         if platform:
             rel_path = str(file_path.relative_to(DATA_DIR))
             if platform.lower() not in rel_path.lower():
                 continue

-        # 类型过滤
+        # Type filter
         if file_type and file_path.suffix[1:].lower() != file_type.lower():
             continue
@@ -89,7 +89,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st
         except Exception:
             continue

-    # 按修改时间排序(最新的在前)
+    # Sort by modification time (newest first)
     files.sort(key=lambda x: x["modified_at"], reverse=True)

     return {"files": files}
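The platform filter above is a substring match on the path relative to DATA_DIR, and the type filter compares the bare extension. A sketch of querying this listing, assuming the same local server; only record_count and modified_at are confirmed by the diff, any other response keys are assumptions:

```python
import requests

BASE = "http://127.0.0.1:8000"  # assumed host/port

# Ask for JSON files whose path mentions "xhs" (Xiaohongshu)
files = requests.get(
    f"{BASE}/data/files",
    params={"platform": "xhs", "file_type": "json"},
).json()["files"]

# Entries arrive sorted by modification time, newest first
for info in files[:5]:
    print(info.get("modified_at"), info.get("record_count"))
```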
@@ -97,7 +97,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st

 @router.get("/files/{file_path:path}")
 async def get_file_content(file_path: str, preview: bool = True, limit: int = 100):
-    """获取文件内容或预览"""
+    """Get file content or preview"""
     full_path = DATA_DIR / file_path

     if not full_path.exists():
@@ -106,14 +106,14 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
     if not full_path.is_file():
         raise HTTPException(status_code=400, detail="Not a file")

-    # 安全检查:确保在 DATA_DIR 内
+    # Security check: ensure within DATA_DIR
     try:
         full_path.resolve().relative_to(DATA_DIR.resolve())
     except ValueError:
         raise HTTPException(status_code=403, detail="Access denied")

     if preview:
-        # 返回预览数据
+        # Return preview data
         try:
             if full_path.suffix == ".json":
                 with open(full_path, "r", encoding="utf-8") as f:
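The resolve()/relative_to() pair is the path-traversal guard: resolve() canonicalizes ".." segments and symlinks, and relative_to() raises ValueError when the result lands outside the data root. A standalone sketch of the same check (the DATA_DIR value here is a hypothetical stand-in):

```python
from pathlib import Path

DATA_DIR = Path("/srv/mediacrawler/data")  # hypothetical data root

def is_inside_data_dir(requested: str) -> bool:
    """Return True only if the resolved path stays inside DATA_DIR."""
    full_path = DATA_DIR / requested
    try:
        # resolve() collapses ".." and follows symlinks;
        # relative_to() raises ValueError if the path escaped
        full_path.resolve().relative_to(DATA_DIR.resolve())
        return True
    except ValueError:
        return False

assert is_inside_data_dir("xhs/notes.json")
assert not is_inside_data_dir("../../etc/passwd")
```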
@@ -130,18 +130,18 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
                         if i >= limit:
                             break
                         rows.append(row)
-                    # 重新读取获取总数
+                    # Re-read to get total count
                     f.seek(0)
                     total = sum(1 for _ in f) - 1
                     return {"data": rows, "total": total}
             elif full_path.suffix.lower() in (".xlsx", ".xls"):
                 import pandas as pd
-                # 读取前 limit 行
+                # Read first limit rows
                 df = pd.read_excel(full_path, nrows=limit)
-                # 获取总行数(只读取第一列来节省内存)
+                # Get total row count (only read first column to save memory)
                 df_count = pd.read_excel(full_path, usecols=[0])
                 total = len(df_count)
-                # 转换为字典列表,处理 NaN 值
+                # Convert to list of dictionaries, handle NaN values
                 rows = df.where(pd.notnull(df), None).to_dict(orient='records')
                 return {
                     "data": rows,
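Two details in the Excel branch are worth spelling out: reading usecols=[0] obtains a row count without loading every column, and df.where(pd.notnull(df), None) swaps NaN for None because NaN is not valid JSON. A small self-contained illustration of the NaN handling (the frame is made up):

```python
import pandas as pd

# Stand-in for a sheet with a missing cell
df = pd.DataFrame({"title": ["a", None], "likes": [10, 3]})

# NaN would break JSON serialization; None serializes as null
rows = df.where(pd.notnull(df), None).to_dict(orient="records")
print(rows)  # [{'title': 'a', 'likes': 10}, {'title': None, 'likes': 3}]
```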
@@ -155,7 +155,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10
         except Exception as e:
             raise HTTPException(status_code=500, detail=str(e))
     else:
-        # 返回文件下载
+        # Return file download
         return FileResponse(
             path=full_path,
             filename=full_path.name,
@@ -165,7 +165,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10

 @router.get("/download/{file_path:path}")
 async def download_file(file_path: str):
-    """下载文件"""
+    """Download file"""
     full_path = DATA_DIR / file_path

     if not full_path.exists():
@@ -174,7 +174,7 @@ async def download_file(file_path: str):
     if not full_path.is_file():
         raise HTTPException(status_code=400, detail="Not a file")

-    # 安全检查
+    # Security check
     try:
         full_path.resolve().relative_to(DATA_DIR.resolve())
     except ValueError:
@@ -189,7 +189,7 @@ async def download_file(file_path: str):

 @router.get("/stats")
 async def get_data_stats():
-    """获取数据统计"""
+    """Get data statistics"""
     if not DATA_DIR.exists():
         return {"total_files": 0, "total_size": 0, "by_platform": {}, "by_type": {}}

@@ -214,11 +214,11 @@ async def get_data_stats():
         stats["total_files"] += 1
         stats["total_size"] += stat.st_size

-        # 按类型统计
+        # Statistics by type
         file_type = file_path.suffix[1:].lower()
         stats["by_type"][file_type] = stats["by_type"].get(file_type, 0) + 1

-        # 按平台统计(从路径推断)
+        # Statistics by platform (inferred from path)
         rel_path = str(file_path.relative_to(DATA_DIR))
         for platform in ["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"]:
             if platform in rel_path.lower():
@@ -27,7 +27,7 @@ router = APIRouter(tags=["websocket"])


 class ConnectionManager:
-    """WebSocket 连接管理器"""
+    """WebSocket connection manager"""

     def __init__(self):
         self.active_connections: Set[WebSocket] = set()
@@ -40,7 +40,7 @@ class ConnectionManager:
         self.active_connections.discard(websocket)

     async def broadcast(self, message: dict):
-        """广播消息到所有连接"""
+        """Broadcast message to all connections"""
         if not self.active_connections:
             return

@@ -51,7 +51,7 @@ class ConnectionManager:
             except Exception:
                 disconnected.append(connection)

-        # 清理断开的连接
+        # Clean up disconnected connections
         for conn in disconnected:
             self.disconnect(conn)

@@ -60,13 +60,13 @@ manager = ConnectionManager()


 async def log_broadcaster():
-    """后台任务:从队列读取日志并广播"""
+    """Background task: read logs from queue and broadcast"""
     queue = crawler_manager.get_log_queue()
     while True:
         try:
-            # 从队列获取日志条目
+            # Get log entry from queue
             entry = await queue.get()
-            # 广播到所有 WebSocket 连接
+            # Broadcast to all WebSocket connections
             await manager.broadcast(entry.model_dump())
         except asyncio.CancelledError:
             break
@@ -75,12 +75,12 @@ async def log_broadcaster():
             await asyncio.sleep(0.1)


-# 全局广播任务
+# Global broadcast task
 _broadcaster_task: Optional[asyncio.Task] = None


 def start_broadcaster():
-    """启动广播任务"""
+    """Start broadcast task"""
     global _broadcaster_task
     if _broadcaster_task is None or _broadcaster_task.done():
         _broadcaster_task = asyncio.create_task(log_broadcaster())
@@ -88,17 +88,17 @@ def start_broadcaster():

 @router.websocket("/ws/logs")
 async def websocket_logs(websocket: WebSocket):
-    """WebSocket 日志流"""
+    """WebSocket log stream"""
     print("[WS] New connection attempt")

     try:
-        # 确保广播任务在运行
+        # Ensure broadcast task is running
         start_broadcaster()

         await manager.connect(websocket)
         print(f"[WS] Connected, active connections: {len(manager.active_connections)}")

-        # 发送现有日志
+        # Send existing logs
         for log in crawler_manager.logs:
             try:
                 await websocket.send_json(log.model_dump())
@@ -109,7 +109,7 @@ async def websocket_logs(websocket: WebSocket):
         print(f"[WS] Sent {len(crawler_manager.logs)} existing logs, entering main loop")

         while True:
-            # 保持连接活跃,接收心跳或任意消息
+            # Keep connection alive, receive heartbeat or any message
             try:
                 data = await asyncio.wait_for(
                     websocket.receive_text(),
@@ -118,7 +118,7 @@ async def websocket_logs(websocket: WebSocket):
                 if data == "ping":
                     await websocket.send_text("pong")
             except asyncio.TimeoutError:
-                # 发送 ping 保持连接
+                # Send ping to keep connection alive
                 try:
                     await websocket.send_text("ping")
                 except Exception as e:
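The loop above implements a two-way heartbeat: the server answers a client's "ping" with "pong", and after a receive timeout it sends its own "ping" to verify the socket is still alive. A client-side sketch that tails /ws/logs, assuming the third-party websockets package and a local server; the heartbeat frames can simply be skipped, since the server does not disconnect on a silent client:

```python
import asyncio
import json
import websockets  # pip install websockets

async def tail_logs():
    # assumed host/port; the /ws/logs path comes from the diff above
    async with websockets.connect("ws://127.0.0.1:8000/ws/logs") as ws:
        while True:
            msg = await ws.recv()
            if msg in ("ping", "pong"):
                continue  # server keepalive frames; safe to ignore
            print(json.loads(msg))  # one log entry per message

asyncio.run(tail_logs())
```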
@@ -136,12 +136,12 @@ async def websocket_logs(websocket: WebSocket):

 @router.websocket("/ws/status")
 async def websocket_status(websocket: WebSocket):
-    """WebSocket 状态流"""
+    """WebSocket status stream"""
     await websocket.accept()

     try:
         while True:
-            # 每秒发送一次状态
+            # Send status every second
             status = crawler_manager.get_status()
             await websocket.send_json(status)
             await asyncio.sleep(1)