Mirror of https://github.com/NanmiCoder/MediaCrawler.git (synced 2026-05-08 11:37:36 +08:00)
i18n: translate all Chinese comments, docstrings, and logger messages to English

Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: Chinese disclaimer header (lines 10-18) for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
@@ -28,7 +28,7 @@ from ..schemas import CrawlerStartRequest, LogEntry
 class CrawlerManager:
-    """爬虫进程管理器"""
+    """Crawler process manager"""
 
     def __init__(self):
         self._lock = asyncio.Lock()
@@ -39,9 +39,9 @@ class CrawlerManager:
         self._log_id = 0
         self._logs: List[LogEntry] = []
         self._read_task: Optional[asyncio.Task] = None
-        # 项目根目录
+        # Project root directory
         self._project_root = Path(__file__).parent.parent.parent
-        # 日志队列 - 用于向 WebSocket 推送
+        # Log queue - for pushing to WebSocket
         self._log_queue: Optional[asyncio.Queue] = None
 
     @property
@@ -49,13 +49,13 @@ class CrawlerManager:
         return self._logs
 
     def get_log_queue(self) -> asyncio.Queue:
-        """获取或创建日志队列"""
+        """Get or create log queue"""
         if self._log_queue is None:
             self._log_queue = asyncio.Queue()
         return self._log_queue
 
     def _create_log_entry(self, message: str, level: str = "info") -> LogEntry:
-        """创建日志条目"""
+        """Create log entry"""
         self._log_id += 1
         entry = LogEntry(
             id=self._log_id,
@@ -64,13 +64,13 @@ class CrawlerManager:
             message=message
         )
         self._logs.append(entry)
-        # 保留最近 500 条日志
+        # Keep last 500 logs
         if len(self._logs) > 500:
             self._logs = self._logs[-500:]
         return entry
 
     async def _push_log(self, entry: LogEntry):
-        """推送日志到队列"""
+        """Push log to queue"""
         if self._log_queue is not None:
             try:
                 self._log_queue.put_nowait(entry)
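The slice-based trim above keeps at most 500 entries but copies the list every time it overflows. For comparison, a `collections.deque` with `maxlen` enforces the same bound without copying; this is a sketch of that alternative design, not what the file uses:

```python
from collections import deque

# Alternative ring buffer: entries beyond 500 fall off the front
# automatically, with no list copy on overflow.
logs: deque = deque(maxlen=500)
for i in range(1000):
    logs.append(i)
assert len(logs) == 500 and logs[0] == 500  # only the last 500 remain
```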
@@ -78,7 +78,7 @@ class CrawlerManager:
                 pass
 
     def _parse_log_level(self, line: str) -> str:
-        """解析日志级别"""
+        """Parse log level"""
         line_upper = line.upper()
         if "ERROR" in line_upper or "FAILED" in line_upper:
             return "error"
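The hunk above cuts off after the ERROR branch, and the next hunk resumes at the method's final `return "info"`. A minimal sketch of what the full keyword-to-level mapping presumably looks like; the middle branch is an assumption, since the diff elides it:

```python
def parse_log_level(line: str) -> str:
    """Keyword-based level sniffing for raw subprocess output (sketch)."""
    line_upper = line.upper()
    if "ERROR" in line_upper or "FAILED" in line_upper:
        return "error"
    # Assumed branch: the elided middle of the real method likely maps
    # warning-like keywords before falling through to "info".
    if "WARNING" in line_upper or "WARN" in line_upper:
        return "warning"
    return "info"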
@@ -91,16 +91,16 @@ class CrawlerManager:
         return "info"
 
     async def start(self, config: CrawlerStartRequest) -> bool:
-        """启动爬虫进程"""
+        """Start crawler process"""
         async with self._lock:
             if self.process and self.process.poll() is None:
                 return False
 
-            # 清空旧日志
+            # Clear old logs
             self._logs = []
             self._log_id = 0
 
-            # 清空待推送队列(不要替换对象,避免 WebSocket 广播协程持有旧队列引用)
+            # Clear pending queue (don't replace object to avoid WebSocket broadcast coroutine holding old queue reference)
             if self._log_queue is None:
                 self._log_queue = asyncio.Queue()
            else:
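The translated comment captures a subtle point: a WebSocket broadcast coroutine may already hold a reference to the queue object, so swapping in a fresh `asyncio.Queue` would leave that consumer reading from a dead queue forever. The `else` branch (continued in the next hunk) therefore drains the existing object in place. A self-contained sketch of the same pattern:

```python
import asyncio

def drain_in_place(queue: asyncio.Queue) -> None:
    """Empty the queue without replacing the object, so any consumer
    coroutine that already holds a reference keeps receiving new items."""
    try:
        while True:
            queue.get_nowait()  # discard one pending item per iteration
    except asyncio.QueueEmpty:
        pass  # queue is now empty; same object, same consumers
```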
@@ -110,15 +110,15 @@ class CrawlerManager:
                 except asyncio.QueueEmpty:
                     pass
 
-            # 构建命令行参数
+            # Build command line arguments
             cmd = self._build_command(config)
 
-            # 记录启动日志
+            # Log start information
             entry = self._create_log_entry(f"Starting crawler: {' '.join(cmd)}", "info")
             await self._push_log(entry)
 
             try:
-                # 启动子进程
+                # Start subprocess
                 self.process = subprocess.Popen(
                     cmd,
                     stdout=subprocess.PIPE,
@@ -139,7 +139,7 @@ class CrawlerManager:
                 )
                 await self._push_log(entry)
 
-                # 启动日志读取任务
+                # Start log reading task
                 self._read_task = asyncio.create_task(self._read_output())
 
                 return True
@@ -150,7 +150,7 @@ class CrawlerManager:
                 return False
 
     async def stop(self) -> bool:
-        """停止爬虫进程"""
+        """Stop crawler process"""
         async with self._lock:
             if not self.process or self.process.poll() is not None:
                 return False
@@ -162,13 +162,13 @@ class CrawlerManager:
             try:
                 self.process.send_signal(signal.SIGTERM)
 
-                # 等待优雅退出 (最多15秒)
+                # Wait for graceful exit (up to 15 seconds)
                 for _ in range(30):
                     if self.process.poll() is not None:
                         break
                     await asyncio.sleep(0.5)
 
-                # 如果还没退出,强制杀死
+                # If still not exited, force kill
                 if self.process.poll() is None:
                     entry = self._create_log_entry("Process not responding, sending SIGKILL...", "warning")
                     await self._push_log(entry)
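The two-phase shutdown here is the standard pattern: SIGTERM first, a bounded wait (30 polls at 0.5 s gives the 15-second grace period the comment names), then SIGKILL only if the child never exits. A self-contained sketch of the same technique, assuming a `subprocess.Popen` handle:

```python
import asyncio
import signal
import subprocess

async def terminate_gracefully(proc: subprocess.Popen, grace_seconds: float = 15.0) -> int:
    """SIGTERM first, then SIGKILL once the grace period runs out."""
    proc.send_signal(signal.SIGTERM)
    loop = asyncio.get_running_loop()
    deadline = loop.time() + grace_seconds
    while proc.poll() is None and loop.time() < deadline:
        await asyncio.sleep(0.5)  # yield to the event loop between polls
    if proc.poll() is None:
        proc.kill()   # escalate: the child gets no chance to clean up
        proc.wait()
    return proc.returncode
```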
@@ -184,7 +184,7 @@ class CrawlerManager:
             self.status = "idle"
             self.current_config = None
 
-            # 取消日志读取任务
+            # Cancel log reading task
             if self._read_task:
                 self._read_task.cancel()
                 self._read_task = None
@@ -192,7 +192,7 @@ class CrawlerManager:
             return True
 
     def get_status(self) -> dict:
-        """获取当前状态"""
+        """Get current status"""
         return {
             "status": self.status,
             "platform": self.current_config.platform.value if self.current_config else None,
@@ -202,7 +202,7 @@ class CrawlerManager:
         }
 
     def _build_command(self, config: CrawlerStartRequest) -> list:
-        """构建 main.py 命令行参数"""
+        """Build main.py command line arguments"""
         cmd = ["uv", "run", "python", "main.py"]
 
         cmd.extend(["--platform", config.platform.value])
@@ -210,7 +210,7 @@ class CrawlerManager:
         cmd.extend(["--type", config.crawler_type.value])
         cmd.extend(["--save_data_option", config.save_option.value])
 
-        # 根据爬虫类型传递不同的参数
+        # Pass different arguments based on crawler type
         if config.crawler_type.value == "search" and config.keywords:
             cmd.extend(["--keywords", config.keywords])
         elif config.crawler_type.value == "detail" and config.specified_ids:
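For context, a hedged example of the argument list these hunks assemble; the flags come from the diff itself, but the concrete values ("xhs", "json", "travel") are illustrative, not taken from this commit:

```python
# Hypothetical result of _build_command for a keyword search (values illustrative):
cmd = [
    "uv", "run", "python", "main.py",
    "--platform", "xhs",           # config.platform.value
    "--type", "search",            # config.crawler_type.value
    "--save_data_option", "json",  # config.save_option.value
    "--keywords", "travel",        # only when crawler_type is "search"
]
# Equivalent shell invocation:
#   uv run python main.py --platform xhs --type search --save_data_option json --keywords travel
```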
@@ -232,12 +232,12 @@ class CrawlerManager:
         return cmd
 
     async def _read_output(self):
-        """异步读取进程输出"""
+        """Asynchronously read process output"""
         loop = asyncio.get_event_loop()
 
         try:
             while self.process and self.process.poll() is None:
-                # 在线程池中读取一行
+                # Read a line in thread pool
                 line = await loop.run_in_executor(
                     None, self.process.stdout.readline
                 )
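`readline` on a pipe blocks, so it is handed to the default thread-pool executor to keep the event loop responsive. A minimal standalone sketch of the same technique (the `echo` command is just a placeholder):

```python
import asyncio
import subprocess

async def stream_lines(cmd: list[str]) -> None:
    """Relay a child process's stdout line by line without blocking the loop."""
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True)
    loop = asyncio.get_running_loop()
    while proc.poll() is None:
        # The blocking readline runs in the default thread-pool executor.
        line = await loop.run_in_executor(None, proc.stdout.readline)
        if line:
            print(line.rstrip())

asyncio.run(stream_lines(["echo", "hello"]))
```

`asyncio.create_subprocess_exec` would avoid the thread hop entirely; routing through the executor instead keeps the synchronous `subprocess.Popen` handle that `start()` and `stop()` already poll and signal.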
@@ -248,7 +248,7 @@ class CrawlerManager:
                 entry = self._create_log_entry(line, level)
                 await self._push_log(entry)
 
-            # 读取剩余输出
+            # Read remaining output
             if self.process and self.process.stdout:
                 remaining = await loop.run_in_executor(
                     None, self.process.stdout.read
@@ -260,7 +260,7 @@ class CrawlerManager:
                     entry = self._create_log_entry(line.strip(), level)
                     await self._push_log(entry)
 
-            # 进程结束
+            # Process ended
             if self.status == "running":
                 exit_code = self.process.returncode if self.process else -1
                 if exit_code == 0:
@@ -277,5 +277,5 @@ class CrawlerManager:
             await self._push_log(entry)
 
 
-# 全局单例
+# Global singleton
 crawler_manager = CrawlerManager()
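The module-level instance gives every consumer in the api/ package the same manager. A hedged usage sketch of what a FastAPI route built on it might look like; the module path and endpoint name are assumptions, not shown in this diff:

```python
# Hypothetical FastAPI route using the shared singleton (names assumed):
from fastapi import APIRouter

from .manager import crawler_manager
from ..schemas import CrawlerStartRequest

router = APIRouter()

@router.post("/crawler/start")
async def start_crawler(config: CrawlerStartRequest) -> dict:
    started = await crawler_manager.start(config)  # False if already running
    return {"ok": started, **crawler_manager.get_status()}
```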