i18n: translate all Chinese comments, docstrings, and logger messages to English

Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: Chinese disclaimer header (lines 10-18) for legal compliance
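For reviewers who want to verify that no stray Chinese text remains after a sweep like this, a quick audit script is handy. This is a hypothetical helper, not part of the commit; the `*.py` glob and default root are assumptions (the preserved disclaimer header would be flagged intentionally):

```python
import re
from pathlib import Path

# Basic CJK Unified Ideographs block; extend (e.g. \u3400-\u4dbf) if needed.
CJK = re.compile(r"[\u4e00-\u9fff]")

def find_chinese(root: str = ".") -> None:
    """Print every Python source line that still contains Chinese characters."""
    for path in Path(root).rglob("*.py"):
        text = path.read_text(encoding="utf-8", errors="ignore")
        for lineno, line in enumerate(text.splitlines(), 1):
            if CJK.search(line):
                print(f"{path}:{lineno}: {line.strip()}")

if __name__ == "__main__":
    find_chinese()
```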

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
程序员阿江(Relakkes)
2025-12-26 23:27:19 +08:00
parent 1544d13dd5
commit 157ddfb21b
93 changed files with 1971 additions and 1955 deletions


@@ -28,7 +28,7 @@ from ..schemas import CrawlerStartRequest, LogEntry


 class CrawlerManager:
-    """爬虫进程管理器"""
+    """Crawler process manager"""

     def __init__(self):
         self._lock = asyncio.Lock()
@@ -39,9 +39,9 @@ class CrawlerManager:
         self._log_id = 0
         self._logs: List[LogEntry] = []
         self._read_task: Optional[asyncio.Task] = None
-        # 项目根目录
+        # Project root directory
         self._project_root = Path(__file__).parent.parent.parent
-        # 日志队列 - 用于向 WebSocket 推送
+        # Log queue - for pushing to WebSocket
        self._log_queue: Optional[asyncio.Queue] = None

     @property
@@ -49,13 +49,13 @@ class CrawlerManager:
         return self._logs

     def get_log_queue(self) -> asyncio.Queue:
-        """获取或创建日志队列"""
+        """Get or create log queue"""
         if self._log_queue is None:
             self._log_queue = asyncio.Queue()
         return self._log_queue

     def _create_log_entry(self, message: str, level: str = "info") -> LogEntry:
-        """创建日志条目"""
+        """Create log entry"""
         self._log_id += 1
         entry = LogEntry(
             id=self._log_id,
@@ -64,13 +64,13 @@ class CrawlerManager:
             message=message
         )
         self._logs.append(entry)
-        # 保留最近 500 条日志
+        # Keep last 500 logs
         if len(self._logs) > 500:
             self._logs = self._logs[-500:]
         return entry

     async def _push_log(self, entry: LogEntry):
-        """推送日志到队列"""
+        """Push log to queue"""
         if self._log_queue is not None:
             try:
                 self._log_queue.put_nowait(entry)
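The manual slice above caps the buffer at the 500 most recent entries. A minimal equivalent sketch (not the project's code) uses `collections.deque` with `maxlen`, which evicts the oldest entry automatically:

```python
from collections import deque

class LogBuffer:
    """Bounded log buffer: a deque with maxlen drops the oldest
    entry on overflow, replacing the manual list slice."""

    def __init__(self, capacity: int = 500):
        self._entries: deque = deque(maxlen=capacity)

    def append(self, entry) -> None:
        self._entries.append(entry)  # no trimming step needed
```

The manager likely keeps a plain list because `logs` is exposed as `List[LogEntry]`; the deque variant trades that for O(1) eviction.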
@@ -78,7 +78,7 @@ class CrawlerManager:
                 pass

     def _parse_log_level(self, line: str) -> str:
-        """解析日志级别"""
+        """Parse log level"""
         line_upper = line.upper()
         if "ERROR" in line_upper or "FAILED" in line_upper:
             return "error"
@@ -91,16 +91,16 @@ class CrawlerManager:
         return "info"

     async def start(self, config: CrawlerStartRequest) -> bool:
-        """启动爬虫进程"""
+        """Start crawler process"""
         async with self._lock:
             if self.process and self.process.poll() is None:
                 return False

-            # 清空旧日志
+            # Clear old logs
             self._logs = []
             self._log_id = 0

-            # 清空待推送队列(不要替换对象,避免 WebSocket 广播协程持有旧队列引用)
+            # Drain the pending queue in place (replacing the Queue object would leave the WebSocket broadcast coroutine holding a stale reference)
             if self._log_queue is None:
                 self._log_queue = asyncio.Queue()
             else:
@@ -110,15 +110,15 @@ class CrawlerManager:
                     except asyncio.QueueEmpty:
                         pass

             # 构建命令行参数
-            # Build command line arguments
+            cmd = self._build_command(config)

-            # 记录启动日志
+            # Log start information
             entry = self._create_log_entry(f"Starting crawler: {' '.join(cmd)}", "info")
             await self._push_log(entry)

             try:
-                # 启动子进程
+                # Start subprocess
                 self.process = subprocess.Popen(
                     cmd,
                     stdout=subprocess.PIPE,
@@ -139,7 +139,7 @@ class CrawlerManager:
                 )
                 await self._push_log(entry)

-                # 启动日志读取任务
+                # Start log reading task
                 self._read_task = asyncio.create_task(self._read_output())

                 return True
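The drain-in-place comment in `start()` deserves a standalone illustration: a coroutine blocked on `await q.get()` waits on the specific Queue instance it captured, so swapping in a fresh Queue would strand it forever. A minimal sketch of the safe pattern:

```python
import asyncio

def drain_in_place(q: asyncio.Queue) -> None:
    """Empty the queue without replacing it; consumers blocked on
    q.get() keep their reference to this exact instance."""
    while True:
        try:
            q.get_nowait()
        except asyncio.QueueEmpty:
            break
```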
@@ -150,7 +150,7 @@ class CrawlerManager:
                 return False

     async def stop(self) -> bool:
-        """停止爬虫进程"""
+        """Stop crawler process"""
         async with self._lock:
             if not self.process or self.process.poll() is not None:
                 return False
@@ -162,13 +162,13 @@ class CrawlerManager:
             try:
                 self.process.send_signal(signal.SIGTERM)

-                # 等待优雅退出 (最多15秒)
+                # Wait for graceful exit (up to 15 seconds)
                 for _ in range(30):
                     if self.process.poll() is not None:
                         break
                     await asyncio.sleep(0.5)

-                # 如果还没退出,强制杀死
+                # If still not exited, force kill
                 if self.process.poll() is None:
                     entry = self._create_log_entry("Process not responding, sending SIGKILL...", "warning")
                     await self._push_log(entry)
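The loop above polls every 0.5 s for up to 15 s before escalating. The same SIGTERM-then-SIGKILL escalation can be written compactly with `Popen.wait` timeouts; a synchronous sketch, not the async code from the diff:

```python
import signal
import subprocess

def stop_gracefully(proc: subprocess.Popen, timeout: float = 15.0) -> int:
    """Ask the process to exit with SIGTERM; escalate after the timeout."""
    proc.send_signal(signal.SIGTERM)
    try:
        return proc.wait(timeout=timeout)   # graceful window
    except subprocess.TimeoutExpired:
        proc.kill()                         # SIGKILL on POSIX
        return proc.wait()
```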
@@ -184,7 +184,7 @@ class CrawlerManager:
             self.status = "idle"
             self.current_config = None

-            # 取消日志读取任务
+            # Cancel log reading task
             if self._read_task:
                 self._read_task.cancel()
                 self._read_task = None
@@ -192,7 +192,7 @@ class CrawlerManager:
             return True

     def get_status(self) -> dict:
-        """获取当前状态"""
+        """Get current status"""
         return {
             "status": self.status,
             "platform": self.current_config.platform.value if self.current_config else None,
@@ -202,7 +202,7 @@ class CrawlerManager:
         }

     def _build_command(self, config: CrawlerStartRequest) -> list:
-        """构建 main.py 命令行参数"""
+        """Build main.py command line arguments"""
         cmd = ["uv", "run", "python", "main.py"]

         cmd.extend(["--platform", config.platform.value])
@@ -210,7 +210,7 @@ class CrawlerManager:
         cmd.extend(["--type", config.crawler_type.value])
         cmd.extend(["--save_data_option", config.save_option.value])

-        # 根据爬虫类型传递不同的参数
+        # Pass different arguments based on crawler type
         if config.crawler_type.value == "search" and config.keywords:
             cmd.extend(["--keywords", config.keywords])
         elif config.crawler_type.value == "detail" and config.specified_ids:
@@ -232,12 +232,12 @@ class CrawlerManager:
         return cmd

     async def _read_output(self):
-        """异步读取进程输出"""
+        """Asynchronously read process output"""
         loop = asyncio.get_event_loop()

         try:
             while self.process and self.process.poll() is None:
-                # 在线程池中读取一行
+                # Read a line in thread pool
                 line = await loop.run_in_executor(
                     None, self.process.stdout.readline
                 )
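`readline()` on a pipe blocks, so the loop hands it to the default thread-pool executor to keep the event loop responsive. The same pattern in isolation, as a sketch (the real method also parses levels and pushes entries):

```python
import asyncio
import subprocess

async def stream_lines(proc: subprocess.Popen):
    """Yield stdout lines from a blocking pipe without stalling the loop."""
    loop = asyncio.get_event_loop()
    while proc.poll() is None:
        # Blocking readline() runs in the default ThreadPoolExecutor.
        line = await loop.run_in_executor(None, proc.stdout.readline)
        if line:
            yield line.rstrip()
```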
@@ -248,7 +248,7 @@ class CrawlerManager:
                 entry = self._create_log_entry(line, level)
                 await self._push_log(entry)

-            # 读取剩余输出
+            # Read remaining output
             if self.process and self.process.stdout:
                 remaining = await loop.run_in_executor(
                     None, self.process.stdout.read
@@ -260,7 +260,7 @@ class CrawlerManager:
                     entry = self._create_log_entry(line.strip(), level)
                     await self._push_log(entry)

-            # 进程结束
+            # Process ended
             if self.status == "running":
                 exit_code = self.process.returncode if self.process else -1
                 if exit_code == 0:
@@ -277,5 +277,5 @@ class CrawlerManager:
                 await self._push_log(entry)


-# 全局单例
+# Global singleton
 crawler_manager = CrawlerManager()
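The module-level instance acts as a process-wide singleton, so every request handler sees the same crawler state. A hypothetical router wiring (the import path and endpoint names are assumptions; the commit message only confirms this is a FastAPI server):

```python
from fastapi import APIRouter

from .manager import crawler_manager  # assumed module path

router = APIRouter(prefix="/crawler")

@router.get("/status")
async def crawler_status() -> dict:
    # All requests share the one module-level manager instance.
    return crawler_manager.get_status()
```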