From 157ddfb21bd534109c0668ffeef9f643aa7c2d15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Fri, 26 Dec 2025 23:27:19 +0800 Subject: [PATCH] i18n: translate all Chinese comments, docstrings, and logger messages to English MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Comprehensive translation of Chinese text to English across the entire codebase: - api/: FastAPI server documentation and logger messages - cache/: Cache abstraction layer comments and docstrings - database/: Database models and MongoDB store documentation - media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu) - model/: Data model documentation - proxy/: Proxy pool and provider documentation - store/: Data storage layer comments - tools/: Utility functions and browser automation - test/: Test file documentation Preserved: Chinese disclaimer header (lines 10-18) for legal compliance 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 --- api/main.py | 84 +++---- api/routers/crawler.py | 12 +- api/routers/data.py | 42 ++-- api/routers/websocket.py | 30 +-- api/schemas/crawler.py | 22 +- api/services/crawler_manager.py | 54 ++--- base/base_crawler.py | 14 +- cache/abs_cache.py | 24 +- cache/cache_factory.py | 12 +- cache/local_cache.py | 32 +-- cache/redis_cache.py | 16 +- cmd_arg/arg.py | 66 +++--- database/db.py | 6 +- database/models.py | 6 +- database/mongodb_store_base.py | 32 +-- main.py | 4 +- media_platform/bilibili/client.py | 108 ++++----- media_platform/bilibili/core.py | 60 ++--- media_platform/bilibili/field.py | 16 +- media_platform/bilibili/help.py | 54 ++--- media_platform/bilibili/login.py | 2 +- media_platform/douyin/field.py | 22 +- media_platform/douyin/help.py | 74 +++---- media_platform/douyin/login.py | 46 ++-- media_platform/kuaishou/client.py | 26 +-- media_platform/kuaishou/core.py | 24 +- media_platform/kuaishou/graphql.py | 4 +- media_platform/kuaishou/help.py | 52 ++--- media_platform/tieba/client.py | 282 ++++++++++++------------ media_platform/tieba/core.py | 122 +++++----- media_platform/tieba/field.py | 10 +- media_platform/tieba/help.py | 102 +++++---- media_platform/tieba/login.py | 2 +- media_platform/weibo/client.py | 74 +++---- media_platform/weibo/core.py | 56 ++--- media_platform/weibo/field.py | 8 +- media_platform/weibo/help.py | 6 +- media_platform/weibo/login.py | 2 +- media_platform/xhs/client.py | 166 +++++++------- media_platform/xhs/core.py | 68 +++--- media_platform/xhs/extractor.py | 14 +- media_platform/xhs/field.py | 39 ++-- media_platform/xhs/help.py | 34 +-- media_platform/xhs/login.py | 22 +- media_platform/xhs/playwright_sign.py | 88 ++++---- media_platform/xhs/xhs_sign.py | 26 +-- media_platform/zhihu/client.py | 98 ++++---- media_platform/zhihu/core.py | 32 +-- media_platform/zhihu/field.py | 34 +-- media_platform/zhihu/help.py | 8 +- model/m_baidu_tieba.py | 80 +++---- model/m_bilibili.py | 4 +- model/m_douyin.py | 4 +- model/m_kuaishou.py | 4 +- model/m_xiaohongshu.py | 2 +- model/m_zhihu.py | 98 ++++---- proxy/__init__.py | 2 +- proxy/base_proxy.py | 14 +- proxy/providers/jishu_http_proxy.py | 30 +-- proxy/providers/kuaidl_proxy.py | 42 ++-- proxy/providers/wandou_http_proxy.py | 36 +-- proxy/proxy_ip_pool.py | 40 ++-- proxy/proxy_mixin.py | 26 +-- proxy/types.py | 18 +- store/bilibili/_store_impl.py | 18 +- store/bilibili/bilibilli_store_media.py | 2 +- 
store/douyin/__init__.py | 40 ++-- store/douyin/_store_impl.py | 18 +- store/kuaishou/__init__.py | 2 +- store/kuaishou/_store_impl.py | 20 +- store/tieba/_store_impl.py | 20 +- store/weibo/__init__.py | 2 +- store/weibo/_store_impl.py | 20 +- store/weibo/weibo_store_media.py | 2 +- store/xhs/__init__.py | 106 ++++----- store/xhs/_store_impl.py | 18 +- store/xhs/xhs_store_media.py | 2 +- store/zhihu/__init__.py | 10 +- store/zhihu/_store_impl.py | 20 +- test/test_db_sync.py | 118 +++++----- test/test_expiring_local_cache.py | 4 +- test/test_mongodb_integration.py | 80 +++---- test/test_proxy_ip_pool.py | 206 ++++++++--------- test/test_redis_cache.py | 2 +- tests/conftest.py | 45 ++-- tools/app_runner.py | 8 +- tools/browser_launcher.py | 100 ++++----- tools/cdp_browser.py | 202 ++++++++--------- tools/crawler_util.py | 10 +- tools/file_header_manager.py | 128 +++++------ tools/slider_util.py | 52 ++--- tools/time_util.py | 32 +-- tools/utils.py | 2 +- 93 files changed, 1971 insertions(+), 1955 deletions(-) diff --git a/api/main.py b/api/main.py index e4526eb..539b4cf 100644 --- a/api/main.py +++ b/api/main.py @@ -18,8 +18,8 @@ """ MediaCrawler WebUI API Server -启动命令: uvicorn api.main:app --port 8080 --reload -或者: python -m api.main +Start command: uvicorn api.main:app --port 8080 --reload +Or: python -m api.main """ import asyncio import os @@ -38,15 +38,15 @@ app = FastAPI( version="1.0.0" ) -# 获取 webui 静态文件目录 +# Get webui static files directory WEBUI_DIR = os.path.join(os.path.dirname(__file__), "webui") -# CORS 配置 - 允许前端开发服务器访问 +# CORS configuration - allow frontend dev server access app.add_middleware( CORSMiddleware, allow_origins=[ "http://localhost:5173", # Vite dev server - "http://localhost:3000", # 备用端口 + "http://localhost:3000", # Backup port "http://127.0.0.1:5173", "http://127.0.0.1:3000", ], @@ -55,7 +55,7 @@ app.add_middleware( allow_headers=["*"], ) -# 注册路由 +# Register routers app.include_router(crawler_router, prefix="/api") app.include_router(data_router, prefix="/api") app.include_router(websocket_router, prefix="/api") @@ -63,7 +63,7 @@ app.include_router(websocket_router, prefix="/api") @app.get("/") async def serve_frontend(): - """返回前端页面""" + """Return frontend page""" index_path = os.path.join(WEBUI_DIR, "index.html") if os.path.exists(index_path): return FileResponse(index_path) @@ -82,103 +82,103 @@ async def health_check(): @app.get("/api/env/check") async def check_environment(): - """检测 MediaCrawler 环境是否配置正确""" + """Check if MediaCrawler environment is configured correctly""" try: - # 运行 uv run main.py --help 命令检测环境 + # Run uv run main.py --help command to check environment process = await asyncio.create_subprocess_exec( "uv", "run", "main.py", "--help", stdout=subprocess.PIPE, stderr=subprocess.PIPE, - cwd="." # 项目根目录 + cwd="." 
# Project root directory ) stdout, stderr = await asyncio.wait_for( process.communicate(), - timeout=30.0 # 30秒超时 + timeout=30.0 # 30 seconds timeout ) if process.returncode == 0: return { "success": True, - "message": "MediaCrawler 环境配置正确", - "output": stdout.decode("utf-8", errors="ignore")[:500] # 截取前500字符 + "message": "MediaCrawler environment configured correctly", + "output": stdout.decode("utf-8", errors="ignore")[:500] # Truncate to first 500 characters } else: error_msg = stderr.decode("utf-8", errors="ignore") or stdout.decode("utf-8", errors="ignore") return { "success": False, - "message": "环境检测失败", + "message": "Environment check failed", "error": error_msg[:500] } except asyncio.TimeoutError: return { "success": False, - "message": "环境检测超时", - "error": "命令执行超过30秒" + "message": "Environment check timeout", + "error": "Command execution exceeded 30 seconds" } except FileNotFoundError: return { "success": False, - "message": "未找到 uv 命令", - "error": "请确保已安装 uv 并配置到系统 PATH" + "message": "uv command not found", + "error": "Please ensure uv is installed and configured in system PATH" } except Exception as e: return { "success": False, - "message": "环境检测出错", + "message": "Environment check error", "error": str(e) } @app.get("/api/config/platforms") async def get_platforms(): - """获取支持的平台列表""" + """Get list of supported platforms""" return { "platforms": [ - {"value": "xhs", "label": "小红书", "icon": "book-open"}, - {"value": "dy", "label": "抖音", "icon": "music"}, - {"value": "ks", "label": "快手", "icon": "video"}, - {"value": "bili", "label": "哔哩哔哩", "icon": "tv"}, - {"value": "wb", "label": "微博", "icon": "message-circle"}, - {"value": "tieba", "label": "百度贴吧", "icon": "messages-square"}, - {"value": "zhihu", "label": "知乎", "icon": "help-circle"}, + {"value": "xhs", "label": "Xiaohongshu", "icon": "book-open"}, + {"value": "dy", "label": "Douyin", "icon": "music"}, + {"value": "ks", "label": "Kuaishou", "icon": "video"}, + {"value": "bili", "label": "Bilibili", "icon": "tv"}, + {"value": "wb", "label": "Weibo", "icon": "message-circle"}, + {"value": "tieba", "label": "Baidu Tieba", "icon": "messages-square"}, + {"value": "zhihu", "label": "Zhihu", "icon": "help-circle"}, ] } @app.get("/api/config/options") async def get_config_options(): - """获取所有配置选项""" + """Get all configuration options""" return { "login_types": [ - {"value": "qrcode", "label": "二维码登录"}, - {"value": "cookie", "label": "Cookie登录"}, + {"value": "qrcode", "label": "QR Code Login"}, + {"value": "cookie", "label": "Cookie Login"}, ], "crawler_types": [ - {"value": "search", "label": "搜索模式"}, - {"value": "detail", "label": "详情模式"}, - {"value": "creator", "label": "创作者模式"}, + {"value": "search", "label": "Search Mode"}, + {"value": "detail", "label": "Detail Mode"}, + {"value": "creator", "label": "Creator Mode"}, ], "save_options": [ - {"value": "json", "label": "JSON 文件"}, - {"value": "csv", "label": "CSV 文件"}, - {"value": "excel", "label": "Excel 文件"}, - {"value": "sqlite", "label": "SQLite 数据库"}, - {"value": "db", "label": "MySQL 数据库"}, - {"value": "mongodb", "label": "MongoDB 数据库"}, + {"value": "json", "label": "JSON File"}, + {"value": "csv", "label": "CSV File"}, + {"value": "excel", "label": "Excel File"}, + {"value": "sqlite", "label": "SQLite Database"}, + {"value": "db", "label": "MySQL Database"}, + {"value": "mongodb", "label": "MongoDB Database"}, ], } -# 挂载静态资源 - 必须放在所有路由之后 +# Mount static resources - must be placed after all routes if os.path.exists(WEBUI_DIR): assets_dir = os.path.join(WEBUI_DIR, "assets") 
if os.path.exists(assets_dir): app.mount("/assets", StaticFiles(directory=assets_dir), name="assets") - # 挂载 logos 目录 + # Mount logos directory logos_dir = os.path.join(WEBUI_DIR, "logos") if os.path.exists(logos_dir): app.mount("/logos", StaticFiles(directory=logos_dir), name="logos") - # 挂载其他静态文件(如 vite.svg) + # Mount other static files (e.g., vite.svg) app.mount("/static", StaticFiles(directory=WEBUI_DIR), name="webui-static") diff --git a/api/routers/crawler.py b/api/routers/crawler.py index b92f1d9..eead9e1 100644 --- a/api/routers/crawler.py +++ b/api/routers/crawler.py @@ -26,10 +26,10 @@ router = APIRouter(prefix="/crawler", tags=["crawler"]) @router.post("/start") async def start_crawler(request: CrawlerStartRequest): - """启动爬虫任务""" + """Start crawler task""" success = await crawler_manager.start(request) if not success: - # 处理并发/重复请求:如果进程已经在跑,返回 400 而不是 500 + # Handle concurrent/duplicate requests: if process is already running, return 400 instead of 500 if crawler_manager.process and crawler_manager.process.poll() is None: raise HTTPException(status_code=400, detail="Crawler is already running") raise HTTPException(status_code=500, detail="Failed to start crawler") @@ -39,10 +39,10 @@ async def start_crawler(request: CrawlerStartRequest): @router.post("/stop") async def stop_crawler(): - """停止爬虫任务""" + """Stop crawler task""" success = await crawler_manager.stop() if not success: - # 处理并发/重复请求:如果进程已退出/不存在,返回 400 而不是 500 + # Handle concurrent/duplicate requests: if process already exited/doesn't exist, return 400 instead of 500 if not crawler_manager.process or crawler_manager.process.poll() is not None: raise HTTPException(status_code=400, detail="No crawler is running") raise HTTPException(status_code=500, detail="Failed to stop crawler") @@ -52,12 +52,12 @@ async def stop_crawler(): @router.get("/status", response_model=CrawlerStatusResponse) async def get_crawler_status(): - """获取爬虫状态""" + """Get crawler status""" return crawler_manager.get_status() @router.get("/logs") async def get_logs(limit: int = 100): - """获取最近的日志""" + """Get recent logs""" logs = crawler_manager.logs[-limit:] if limit > 0 else crawler_manager.logs return {"logs": [log.model_dump() for log in logs]} diff --git a/api/routers/data.py b/api/routers/data.py index 4d35a47..7dc81af 100644 --- a/api/routers/data.py +++ b/api/routers/data.py @@ -26,16 +26,16 @@ from fastapi.responses import FileResponse router = APIRouter(prefix="/data", tags=["data"]) -# 数据目录 +# Data directory DATA_DIR = Path(__file__).parent.parent.parent / "data" def get_file_info(file_path: Path) -> dict: - """获取文件信息""" + """Get file information""" stat = file_path.stat() record_count = None - # 尝试获取记录数 + # Try to get record count try: if file_path.suffix == ".json": with open(file_path, "r", encoding="utf-8") as f: @@ -44,7 +44,7 @@ def get_file_info(file_path: Path) -> dict: record_count = len(data) elif file_path.suffix == ".csv": with open(file_path, "r", encoding="utf-8") as f: - record_count = sum(1 for _ in f) - 1 # 减去标题行 + record_count = sum(1 for _ in f) - 1 # Subtract header row except Exception: pass @@ -60,7 +60,7 @@ def get_file_info(file_path: Path) -> dict: @router.get("/files") async def list_data_files(platform: Optional[str] = None, file_type: Optional[str] = None): - """获取数据文件列表""" + """Get data file list""" if not DATA_DIR.exists(): return {"files": []} @@ -74,13 +74,13 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st if file_path.suffix.lower() not in supported_extensions: continue - # 
平台过滤 + # Platform filter if platform: rel_path = str(file_path.relative_to(DATA_DIR)) if platform.lower() not in rel_path.lower(): continue - # 类型过滤 + # Type filter if file_type and file_path.suffix[1:].lower() != file_type.lower(): continue @@ -89,7 +89,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st except Exception: continue - # 按修改时间排序(最新的在前) + # Sort by modification time (newest first) files.sort(key=lambda x: x["modified_at"], reverse=True) return {"files": files} @@ -97,7 +97,7 @@ async def list_data_files(platform: Optional[str] = None, file_type: Optional[st @router.get("/files/{file_path:path}") async def get_file_content(file_path: str, preview: bool = True, limit: int = 100): - """获取文件内容或预览""" + """Get file content or preview""" full_path = DATA_DIR / file_path if not full_path.exists(): @@ -106,14 +106,14 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10 if not full_path.is_file(): raise HTTPException(status_code=400, detail="Not a file") - # 安全检查:确保在 DATA_DIR 内 + # Security check: ensure within DATA_DIR try: full_path.resolve().relative_to(DATA_DIR.resolve()) except ValueError: raise HTTPException(status_code=403, detail="Access denied") if preview: - # 返回预览数据 + # Return preview data try: if full_path.suffix == ".json": with open(full_path, "r", encoding="utf-8") as f: @@ -130,18 +130,18 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10 if i >= limit: break rows.append(row) - # 重新读取获取总数 + # Re-read to get total count f.seek(0) total = sum(1 for _ in f) - 1 return {"data": rows, "total": total} elif full_path.suffix.lower() in (".xlsx", ".xls"): import pandas as pd - # 读取前 limit 行 + # Read first limit rows df = pd.read_excel(full_path, nrows=limit) - # 获取总行数(只读取第一列来节省内存) + # Get total row count (only read first column to save memory) df_count = pd.read_excel(full_path, usecols=[0]) total = len(df_count) - # 转换为字典列表,处理 NaN 值 + # Convert to list of dictionaries, handle NaN values rows = df.where(pd.notnull(df), None).to_dict(orient='records') return { "data": rows, @@ -155,7 +155,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10 except Exception as e: raise HTTPException(status_code=500, detail=str(e)) else: - # 返回文件下载 + # Return file download return FileResponse( path=full_path, filename=full_path.name, @@ -165,7 +165,7 @@ async def get_file_content(file_path: str, preview: bool = True, limit: int = 10 @router.get("/download/{file_path:path}") async def download_file(file_path: str): - """下载文件""" + """Download file""" full_path = DATA_DIR / file_path if not full_path.exists(): @@ -174,7 +174,7 @@ async def download_file(file_path: str): if not full_path.is_file(): raise HTTPException(status_code=400, detail="Not a file") - # 安全检查 + # Security check try: full_path.resolve().relative_to(DATA_DIR.resolve()) except ValueError: @@ -189,7 +189,7 @@ async def download_file(file_path: str): @router.get("/stats") async def get_data_stats(): - """获取数据统计""" + """Get data statistics""" if not DATA_DIR.exists(): return {"total_files": 0, "total_size": 0, "by_platform": {}, "by_type": {}} @@ -214,11 +214,11 @@ async def get_data_stats(): stats["total_files"] += 1 stats["total_size"] += stat.st_size - # 按类型统计 + # Statistics by type file_type = file_path.suffix[1:].lower() stats["by_type"][file_type] = stats["by_type"].get(file_type, 0) + 1 - # 按平台统计(从路径推断) + # Statistics by platform (inferred from path) rel_path = str(file_path.relative_to(DATA_DIR)) 
for platform in ["xhs", "dy", "ks", "bili", "wb", "tieba", "zhihu"]: if platform in rel_path.lower(): diff --git a/api/routers/websocket.py b/api/routers/websocket.py index 9f2b02f..215d4ee 100644 --- a/api/routers/websocket.py +++ b/api/routers/websocket.py @@ -27,7 +27,7 @@ router = APIRouter(tags=["websocket"]) class ConnectionManager: - """WebSocket 连接管理器""" + """WebSocket connection manager""" def __init__(self): self.active_connections: Set[WebSocket] = set() @@ -40,7 +40,7 @@ class ConnectionManager: self.active_connections.discard(websocket) async def broadcast(self, message: dict): - """广播消息到所有连接""" + """Broadcast message to all connections""" if not self.active_connections: return @@ -51,7 +51,7 @@ class ConnectionManager: except Exception: disconnected.append(connection) - # 清理断开的连接 + # Clean up disconnected connections for conn in disconnected: self.disconnect(conn) @@ -60,13 +60,13 @@ manager = ConnectionManager() async def log_broadcaster(): - """后台任务:从队列读取日志并广播""" + """Background task: read logs from queue and broadcast""" queue = crawler_manager.get_log_queue() while True: try: - # 从队列获取日志条目 + # Get log entry from queue entry = await queue.get() - # 广播到所有 WebSocket 连接 + # Broadcast to all WebSocket connections await manager.broadcast(entry.model_dump()) except asyncio.CancelledError: break @@ -75,12 +75,12 @@ async def log_broadcaster(): await asyncio.sleep(0.1) -# 全局广播任务 +# Global broadcast task _broadcaster_task: Optional[asyncio.Task] = None def start_broadcaster(): - """启动广播任务""" + """Start broadcast task""" global _broadcaster_task if _broadcaster_task is None or _broadcaster_task.done(): _broadcaster_task = asyncio.create_task(log_broadcaster()) @@ -88,17 +88,17 @@ def start_broadcaster(): @router.websocket("/ws/logs") async def websocket_logs(websocket: WebSocket): - """WebSocket 日志流""" + """WebSocket log stream""" print("[WS] New connection attempt") try: - # 确保广播任务在运行 + # Ensure broadcast task is running start_broadcaster() await manager.connect(websocket) print(f"[WS] Connected, active connections: {len(manager.active_connections)}") - # 发送现有日志 + # Send existing logs for log in crawler_manager.logs: try: await websocket.send_json(log.model_dump()) @@ -109,7 +109,7 @@ async def websocket_logs(websocket: WebSocket): print(f"[WS] Sent {len(crawler_manager.logs)} existing logs, entering main loop") while True: - # 保持连接活跃,接收心跳或任意消息 + # Keep connection alive, receive heartbeat or any message try: data = await asyncio.wait_for( websocket.receive_text(), @@ -118,7 +118,7 @@ async def websocket_logs(websocket: WebSocket): if data == "ping": await websocket.send_text("pong") except asyncio.TimeoutError: - # 发送 ping 保持连接 + # Send ping to keep connection alive try: await websocket.send_text("ping") except Exception as e: @@ -136,12 +136,12 @@ async def websocket_logs(websocket: WebSocket): @router.websocket("/ws/status") async def websocket_status(websocket: WebSocket): - """WebSocket 状态流""" + """WebSocket status stream""" await websocket.accept() try: while True: - # 每秒发送一次状态 + # Send status every second status = crawler_manager.get_status() await websocket.send_json(status) await asyncio.sleep(1) diff --git a/api/schemas/crawler.py b/api/schemas/crawler.py index 2802002..76538fb 100644 --- a/api/schemas/crawler.py +++ b/api/schemas/crawler.py @@ -22,7 +22,7 @@ from pydantic import BaseModel class PlatformEnum(str, Enum): - """支持的媒体平台""" + """Supported media platforms""" XHS = "xhs" DOUYIN = "dy" KUAISHOU = "ks" @@ -33,21 +33,21 @@ class PlatformEnum(str, Enum): class 
LoginTypeEnum(str, Enum): - """登录方式""" + """Login method""" QRCODE = "qrcode" PHONE = "phone" COOKIE = "cookie" class CrawlerTypeEnum(str, Enum): - """爬虫类型""" + """Crawler type""" SEARCH = "search" DETAIL = "detail" CREATOR = "creator" class SaveDataOptionEnum(str, Enum): - """数据保存方式""" + """Data save option""" CSV = "csv" DB = "db" JSON = "json" @@ -57,13 +57,13 @@ class SaveDataOptionEnum(str, Enum): class CrawlerStartRequest(BaseModel): - """启动爬虫请求""" + """Crawler start request""" platform: PlatformEnum login_type: LoginTypeEnum = LoginTypeEnum.QRCODE crawler_type: CrawlerTypeEnum = CrawlerTypeEnum.SEARCH - keywords: str = "" # 搜索模式下的关键词 - specified_ids: str = "" # 详情模式下的帖子/视频ID列表,逗号分隔 - creator_ids: str = "" # 创作者模式下的创作者ID列表,逗号分隔 + keywords: str = "" # Keywords for search mode + specified_ids: str = "" # Post/video ID list for detail mode, comma-separated + creator_ids: str = "" # Creator ID list for creator mode, comma-separated start_page: int = 1 enable_comments: bool = True enable_sub_comments: bool = False @@ -73,7 +73,7 @@ class CrawlerStartRequest(BaseModel): class CrawlerStatusResponse(BaseModel): - """爬虫状态响应""" + """Crawler status response""" status: Literal["idle", "running", "stopping", "error"] platform: Optional[str] = None crawler_type: Optional[str] = None @@ -82,7 +82,7 @@ class CrawlerStatusResponse(BaseModel): class LogEntry(BaseModel): - """日志条目""" + """Log entry""" id: int timestamp: str level: Literal["info", "warning", "error", "success", "debug"] @@ -90,7 +90,7 @@ class LogEntry(BaseModel): class DataFileInfo(BaseModel): - """数据文件信息""" + """Data file information""" name: str path: str size: int diff --git a/api/services/crawler_manager.py b/api/services/crawler_manager.py index da4204f..2d0aad5 100644 --- a/api/services/crawler_manager.py +++ b/api/services/crawler_manager.py @@ -28,7 +28,7 @@ from ..schemas import CrawlerStartRequest, LogEntry class CrawlerManager: - """爬虫进程管理器""" + """Crawler process manager""" def __init__(self): self._lock = asyncio.Lock() @@ -39,9 +39,9 @@ class CrawlerManager: self._log_id = 0 self._logs: List[LogEntry] = [] self._read_task: Optional[asyncio.Task] = None - # 项目根目录 + # Project root directory self._project_root = Path(__file__).parent.parent.parent - # 日志队列 - 用于向 WebSocket 推送 + # Log queue - for pushing to WebSocket self._log_queue: Optional[asyncio.Queue] = None @property @@ -49,13 +49,13 @@ class CrawlerManager: return self._logs def get_log_queue(self) -> asyncio.Queue: - """获取或创建日志队列""" + """Get or create log queue""" if self._log_queue is None: self._log_queue = asyncio.Queue() return self._log_queue def _create_log_entry(self, message: str, level: str = "info") -> LogEntry: - """创建日志条目""" + """Create log entry""" self._log_id += 1 entry = LogEntry( id=self._log_id, @@ -64,13 +64,13 @@ class CrawlerManager: message=message ) self._logs.append(entry) - # 保留最近 500 条日志 + # Keep last 500 logs if len(self._logs) > 500: self._logs = self._logs[-500:] return entry async def _push_log(self, entry: LogEntry): - """推送日志到队列""" + """Push log to queue""" if self._log_queue is not None: try: self._log_queue.put_nowait(entry) @@ -78,7 +78,7 @@ class CrawlerManager: pass def _parse_log_level(self, line: str) -> str: - """解析日志级别""" + """Parse log level""" line_upper = line.upper() if "ERROR" in line_upper or "FAILED" in line_upper: return "error" @@ -91,16 +91,16 @@ class CrawlerManager: return "info" async def start(self, config: CrawlerStartRequest) -> bool: - """启动爬虫进程""" + """Start crawler process""" async with self._lock: if 
self.process and self.process.poll() is None: return False - # 清空旧日志 + # Clear old logs self._logs = [] self._log_id = 0 - # 清空待推送队列(不要替换对象,避免 WebSocket 广播协程持有旧队列引用) + # Clear pending queue (don't replace object to avoid WebSocket broadcast coroutine holding old queue reference) if self._log_queue is None: self._log_queue = asyncio.Queue() else: @@ -110,15 +110,15 @@ class CrawlerManager: except asyncio.QueueEmpty: pass - # 构建命令行参数 + # Build command line arguments cmd = self._build_command(config) - # 记录启动日志 + # Log start information entry = self._create_log_entry(f"Starting crawler: {' '.join(cmd)}", "info") await self._push_log(entry) try: - # 启动子进程 + # Start subprocess self.process = subprocess.Popen( cmd, stdout=subprocess.PIPE, @@ -139,7 +139,7 @@ class CrawlerManager: ) await self._push_log(entry) - # 启动日志读取任务 + # Start log reading task self._read_task = asyncio.create_task(self._read_output()) return True @@ -150,7 +150,7 @@ class CrawlerManager: return False async def stop(self) -> bool: - """停止爬虫进程""" + """Stop crawler process""" async with self._lock: if not self.process or self.process.poll() is not None: return False @@ -162,13 +162,13 @@ class CrawlerManager: try: self.process.send_signal(signal.SIGTERM) - # 等待优雅退出 (最多15秒) + # Wait for graceful exit (up to 15 seconds) for _ in range(30): if self.process.poll() is not None: break await asyncio.sleep(0.5) - # 如果还没退出,强制杀死 + # If still not exited, force kill if self.process.poll() is None: entry = self._create_log_entry("Process not responding, sending SIGKILL...", "warning") await self._push_log(entry) @@ -184,7 +184,7 @@ class CrawlerManager: self.status = "idle" self.current_config = None - # 取消日志读取任务 + # Cancel log reading task if self._read_task: self._read_task.cancel() self._read_task = None @@ -192,7 +192,7 @@ class CrawlerManager: return True def get_status(self) -> dict: - """获取当前状态""" + """Get current status""" return { "status": self.status, "platform": self.current_config.platform.value if self.current_config else None, @@ -202,7 +202,7 @@ class CrawlerManager: } def _build_command(self, config: CrawlerStartRequest) -> list: - """构建 main.py 命令行参数""" + """Build main.py command line arguments""" cmd = ["uv", "run", "python", "main.py"] cmd.extend(["--platform", config.platform.value]) @@ -210,7 +210,7 @@ class CrawlerManager: cmd.extend(["--type", config.crawler_type.value]) cmd.extend(["--save_data_option", config.save_option.value]) - # 根据爬虫类型传递不同的参数 + # Pass different arguments based on crawler type if config.crawler_type.value == "search" and config.keywords: cmd.extend(["--keywords", config.keywords]) elif config.crawler_type.value == "detail" and config.specified_ids: @@ -232,12 +232,12 @@ class CrawlerManager: return cmd async def _read_output(self): - """异步读取进程输出""" + """Asynchronously read process output""" loop = asyncio.get_event_loop() try: while self.process and self.process.poll() is None: - # 在线程池中读取一行 + # Read a line in thread pool line = await loop.run_in_executor( None, self.process.stdout.readline ) @@ -248,7 +248,7 @@ class CrawlerManager: entry = self._create_log_entry(line, level) await self._push_log(entry) - # 读取剩余输出 + # Read remaining output if self.process and self.process.stdout: remaining = await loop.run_in_executor( None, self.process.stdout.read @@ -260,7 +260,7 @@ class CrawlerManager: entry = self._create_log_entry(line.strip(), level) await self._push_log(entry) - # 进程结束 + # Process ended if self.status == "running": exit_code = self.process.returncode if self.process else -1 if 
exit_code == 0: @@ -277,5 +277,5 @@ class CrawlerManager: await self._push_log(entry) -# 全局单例 +# Global singleton crawler_manager = CrawlerManager() diff --git a/base/base_crawler.py b/base/base_crawler.py index c00cf38..4fe9eaa 100644 --- a/base/base_crawler.py +++ b/base/base_crawler.py @@ -53,14 +53,14 @@ class AbstractCrawler(ABC): async def launch_browser_with_cdp(self, playwright: Playwright, playwright_proxy: Optional[Dict], user_agent: Optional[str], headless: bool = True) -> BrowserContext: """ - 使用CDP模式启动浏览器(可选实现) - :param playwright: playwright实例 - :param playwright_proxy: playwright代理配置 - :param user_agent: 用户代理 - :param headless: 无头模式 - :return: 浏览器上下文 + Launch browser using CDP mode (optional implementation) + :param playwright: playwright instance + :param playwright_proxy: playwright proxy configuration + :param user_agent: user agent + :param headless: headless mode + :return: browser context """ - # 默认实现:回退到标准模式 + # Default implementation: fallback to standard mode return await self.launch_browser(playwright.chromium, playwright_proxy, user_agent, headless) diff --git a/cache/abs_cache.py b/cache/abs_cache.py index 2c9f2b0..c620af5 100644 --- a/cache/abs_cache.py +++ b/cache/abs_cache.py @@ -20,9 +20,9 @@ # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com -# @Name : 程序员阿江-Relakkes +# @Name : Programmer AJiang-Relakkes # @Time : 2024/6/2 11:06 -# @Desc : 抽象类 +# @Desc : Abstract class from abc import ABC, abstractmethod from typing import Any, List, Optional @@ -33,9 +33,9 @@ class AbstractCache(ABC): @abstractmethod def get(self, key: str) -> Optional[Any]: """ - 从缓存中获取键的值。 - 这是一个抽象方法。子类必须实现这个方法。 - :param key: 键 + Get the value of a key from the cache. + This is an abstract method. Subclasses must implement this method. + :param key: The key :return: """ raise NotImplementedError @@ -43,11 +43,11 @@ class AbstractCache(ABC): @abstractmethod def set(self, key: str, value: Any, expire_time: int) -> None: """ - 将键的值设置到缓存中。 - 这是一个抽象方法。子类必须实现这个方法。 - :param key: 键 - :param value: 值 - :param expire_time: 过期时间 + Set the value of a key in the cache. + This is an abstract method. Subclasses must implement this method. 
+ :param key: The key + :param value: The value + :param expire_time: Expiration time :return: """ raise NotImplementedError @@ -55,8 +55,8 @@ class AbstractCache(ABC): @abstractmethod def keys(self, pattern: str) -> List[str]: """ - 获取所有符合pattern的key - :param pattern: 匹配模式 + Get all keys matching the pattern + :param pattern: Matching pattern :return: """ raise NotImplementedError diff --git a/cache/cache_factory.py b/cache/cache_factory.py index 0bae5ea..3e9e598 100644 --- a/cache/cache_factory.py +++ b/cache/cache_factory.py @@ -20,23 +20,23 @@ # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com -# @Name : 程序员阿江-Relakkes +# @Name : Programmer AJiang-Relakkes # @Time : 2024/6/2 11:23 # @Desc : class CacheFactory: """ - 缓存工厂类 + Cache factory class """ @staticmethod def create_cache(cache_type: str, *args, **kwargs): """ - 创建缓存对象 - :param cache_type: 缓存类型 - :param args: 参数 - :param kwargs: 关键字参数 + Create cache object + :param cache_type: Cache type + :param args: Arguments + :param kwargs: Keyword arguments :return: """ if cache_type == 'memory': diff --git a/cache/local_cache.py b/cache/local_cache.py index 127f025..f3f9a81 100644 --- a/cache/local_cache.py +++ b/cache/local_cache.py @@ -20,9 +20,9 @@ # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com -# @Name : 程序员阿江-Relakkes +# @Name : Programmer AJiang-Relakkes # @Time : 2024/6/2 11:05 -# @Desc : 本地缓存 +# @Desc : Local cache import asyncio import time @@ -35,19 +35,19 @@ class ExpiringLocalCache(AbstractCache): def __init__(self, cron_interval: int = 10): """ - 初始化本地缓存 - :param cron_interval: 定时清楚cache的时间间隔 + Initialize local cache + :param cron_interval: Time interval for scheduled cache cleanup :return: """ self._cron_interval = cron_interval self._cache_container: Dict[str, Tuple[Any, float]] = {} self._cron_task: Optional[asyncio.Task] = None - # 开启定时清理任务 + # Start scheduled cleanup task self._schedule_clear() def __del__(self): """ - 析构函数,清理定时任务 + Destructor function, cleanup scheduled task :return: """ if self._cron_task is not None: @@ -55,7 +55,7 @@ class ExpiringLocalCache(AbstractCache): def get(self, key: str) -> Optional[Any]: """ - 从缓存中获取键的值 + Get the value of a key from the cache :param key: :return: """ @@ -63,7 +63,7 @@ class ExpiringLocalCache(AbstractCache): if value is None: return None - # 如果键已过期,则删除键并返回None + # If the key has expired, delete it and return None if expire_time < time.time(): del self._cache_container[key] return None @@ -72,7 +72,7 @@ class ExpiringLocalCache(AbstractCache): def set(self, key: str, value: Any, expire_time: int) -> None: """ - 将键的值设置到缓存中 + Set the value of a key in the cache :param key: :param value: :param expire_time: @@ -82,14 +82,14 @@ class ExpiringLocalCache(AbstractCache): def keys(self, pattern: str) -> List[str]: """ - 获取所有符合pattern的key - :param pattern: 匹配模式 + Get all keys matching the pattern + :param pattern: Matching pattern :return: """ if pattern == '*': return list(self._cache_container.keys()) - # 本地缓存通配符暂时将*替换为空 + # For local cache wildcard, temporarily replace * with empty string if '*' in pattern: pattern = pattern.replace('*', '') @@ -97,7 +97,7 @@ class ExpiringLocalCache(AbstractCache): def _schedule_clear(self): """ - 开启定时清理任务, + Start scheduled cleanup task :return: """ @@ -111,7 +111,7 @@ class ExpiringLocalCache(AbstractCache): def _clear(self): """ - 根据过期时间清理缓存 + Clean up cache based on expiration time :return: """ for key, (value, expire_time) in self._cache_container.items(): @@ -120,7 +120,7 @@ class ExpiringLocalCache(AbstractCache): async def 
_start_clear_cron(self): """ - 开启定时清理任务 + Start scheduled cleanup task :return: """ while True: @@ -130,7 +130,7 @@ class ExpiringLocalCache(AbstractCache): if __name__ == '__main__': cache = ExpiringLocalCache(cron_interval=2) - cache.set('name', '程序员阿江-Relakkes', 3) + cache.set('name', 'Programmer AJiang-Relakkes', 3) print(cache.get('key')) print(cache.keys("*")) time.sleep(4) diff --git a/cache/redis_cache.py b/cache/redis_cache.py index f8710a7..1ea4c0b 100644 --- a/cache/redis_cache.py +++ b/cache/redis_cache.py @@ -20,9 +20,9 @@ # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com -# @Name : 程序员阿江-Relakkes +# @Name : Programmer AJiang-Relakkes # @Time : 2024/5/29 22:57 -# @Desc : RedisCache实现 +# @Desc : RedisCache implementation import pickle import time from typing import Any, List @@ -36,13 +36,13 @@ from config import db_config class RedisCache(AbstractCache): def __init__(self) -> None: - # 连接redis, 返回redis客户端 + # Connect to redis, return redis client self._redis_client = self._connet_redis() @staticmethod def _connet_redis() -> Redis: """ - 连接redis, 返回redis客户端, 这里按需配置redis连接信息 + Connect to redis, return redis client, configure redis connection information as needed :return: """ return Redis( @@ -54,7 +54,7 @@ class RedisCache(AbstractCache): def get(self, key: str) -> Any: """ - 从缓存中获取键的值, 并且反序列化 + Get the value of a key from the cache and deserialize it :param key: :return: """ @@ -65,7 +65,7 @@ class RedisCache(AbstractCache): def set(self, key: str, value: Any, expire_time: int) -> None: """ - 将键的值设置到缓存中, 并且序列化 + Set the value of a key in the cache and serialize it :param key: :param value: :param expire_time: @@ -75,7 +75,7 @@ class RedisCache(AbstractCache): def keys(self, pattern: str) -> List[str]: """ - 获取所有符合pattern的key + Get all keys matching the pattern """ return [key.decode() for key in self._redis_client.keys(pattern)] @@ -83,7 +83,7 @@ class RedisCache(AbstractCache): if __name__ == '__main__': redis_cache = RedisCache() # basic usage - redis_cache.set("name", "程序员阿江-Relakkes", 1) + redis_cache.set("name", "Programmer AJiang-Relakkes", 1) print(redis_cache.get("name")) # Relakkes print(redis_cache.keys("*")) # ['name'] time.sleep(2) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index b29edfc..2517c24 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -37,7 +37,7 @@ EnumT = TypeVar("EnumT", bound=Enum) class PlatformEnum(str, Enum): - """支持的媒体平台枚举""" + """Supported media platform enumeration""" XHS = "xhs" DOUYIN = "dy" @@ -49,7 +49,7 @@ class PlatformEnum(str, Enum): class LoginTypeEnum(str, Enum): - """登录方式枚举""" + """Login type enumeration""" QRCODE = "qrcode" PHONE = "phone" @@ -57,7 +57,7 @@ class LoginTypeEnum(str, Enum): class CrawlerTypeEnum(str, Enum): - """爬虫类型枚举""" + """Crawler type enumeration""" SEARCH = "search" DETAIL = "detail" @@ -65,7 +65,7 @@ class CrawlerTypeEnum(str, Enum): class SaveDataOptionEnum(str, Enum): - """数据保存方式枚举""" + """Data save option enumeration""" CSV = "csv" DB = "db" @@ -76,7 +76,7 @@ class SaveDataOptionEnum(str, Enum): class InitDbOptionEnum(str, Enum): - """数据库初始化选项""" + """Database initialization option""" SQLITE = "sqlite" MYSQL = "mysql" @@ -102,7 +102,7 @@ def _coerce_enum( return enum_cls(value) except ValueError: typer.secho( - f"⚠️ 配置值 '{value}' 不在 {enum_cls.__name__} 支持的范围内,已回退到默认值 '{default.value}'.", + f"⚠️ Config value '{value}' is not within the supported range of {enum_cls.__name__}, falling back to default value '{default.value}'.", fg=typer.colors.YELLOW, ) return default @@ -133,7 +133,7 @@ def 
_inject_init_db_default(args: Sequence[str]) -> list[str]: async def parse_cmd(argv: Optional[Sequence[str]] = None): - """使用 Typer 解析命令行参数。""" + """Parse command line arguments using Typer.""" app = typer.Typer(add_completion=False) @@ -143,48 +143,48 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): PlatformEnum, typer.Option( "--platform", - help="媒体平台选择 (xhs=小红书 | dy=抖音 | ks=快手 | bili=哔哩哔哩 | wb=微博 | tieba=百度贴吧 | zhihu=知乎)", - rich_help_panel="基础配置", + help="Media platform selection (xhs=XiaoHongShu | dy=Douyin | ks=Kuaishou | bili=Bilibili | wb=Weibo | tieba=Baidu Tieba | zhihu=Zhihu)", + rich_help_panel="Basic Configuration", ), ] = _coerce_enum(PlatformEnum, config.PLATFORM, PlatformEnum.XHS), lt: Annotated[ LoginTypeEnum, typer.Option( "--lt", - help="登录方式 (qrcode=二维码 | phone=手机号 | cookie=Cookie)", - rich_help_panel="账号配置", + help="Login type (qrcode=QR Code | phone=Phone | cookie=Cookie)", + rich_help_panel="Account Configuration", ), ] = _coerce_enum(LoginTypeEnum, config.LOGIN_TYPE, LoginTypeEnum.QRCODE), crawler_type: Annotated[ CrawlerTypeEnum, typer.Option( "--type", - help="爬取类型 (search=搜索 | detail=详情 | creator=创作者)", - rich_help_panel="基础配置", + help="Crawler type (search=Search | detail=Detail | creator=Creator)", + rich_help_panel="Basic Configuration", ), ] = _coerce_enum(CrawlerTypeEnum, config.CRAWLER_TYPE, CrawlerTypeEnum.SEARCH), start: Annotated[ int, typer.Option( "--start", - help="起始页码", - rich_help_panel="基础配置", + help="Starting page number", + rich_help_panel="Basic Configuration", ), ] = config.START_PAGE, keywords: Annotated[ str, typer.Option( "--keywords", - help="请输入关键词,多个关键词用逗号分隔", - rich_help_panel="基础配置", + help="Enter keywords, multiple keywords separated by commas", + rich_help_panel="Basic Configuration", ), ] = config.KEYWORDS, get_comment: Annotated[ str, typer.Option( "--get_comment", - help="是否爬取一级评论,支持 yes/true/t/y/1 或 no/false/f/n/0", - rich_help_panel="评论配置", + help="Whether to crawl first-level comments, supports yes/true/t/y/1 or no/false/f/n/0", + rich_help_panel="Comment Configuration", show_default=True, ), ] = str(config.ENABLE_GET_COMMENTS), @@ -192,8 +192,8 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): str, typer.Option( "--get_sub_comment", - help="是否爬取二级评论,支持 yes/true/t/y/1 或 no/false/f/n/0", - rich_help_panel="评论配置", + help="Whether to crawl second-level comments, supports yes/true/t/y/1 or no/false/f/n/0", + rich_help_panel="Comment Configuration", show_default=True, ), ] = str(config.ENABLE_GET_SUB_COMMENTS), @@ -201,8 +201,8 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): str, typer.Option( "--headless", - help="是否启用无头模式(对 Playwright 和 CDP 均生效),支持 yes/true/t/y/1 或 no/false/f/n/0", - rich_help_panel="运行配置", + help="Whether to enable headless mode (applies to both Playwright and CDP), supports yes/true/t/y/1 or no/false/f/n/0", + rich_help_panel="Runtime Configuration", show_default=True, ), ] = str(config.HEADLESS), @@ -210,8 +210,8 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): SaveDataOptionEnum, typer.Option( "--save_data_option", - help="数据保存方式 (csv=CSV文件 | db=MySQL数据库 | json=JSON文件 | sqlite=SQLite数据库 | mongodb=MongoDB数据库 | excel=Excel文件)", - rich_help_panel="存储配置", + help="Data save option (csv=CSV file | db=MySQL database | json=JSON file | sqlite=SQLite database | mongodb=MongoDB database | excel=Excel file)", + rich_help_panel="Storage Configuration", ), ] = _coerce_enum( SaveDataOptionEnum, config.SAVE_DATA_OPTION, SaveDataOptionEnum.JSON @@ -220,32 +220,32 @@ async 
def parse_cmd(argv: Optional[Sequence[str]] = None): Optional[InitDbOptionEnum], typer.Option( "--init_db", - help="初始化数据库表结构 (sqlite | mysql)", - rich_help_panel="存储配置", + help="Initialize database table structure (sqlite | mysql)", + rich_help_panel="Storage Configuration", ), ] = None, cookies: Annotated[ str, typer.Option( "--cookies", - help="Cookie 登录方式使用的 Cookie 值", - rich_help_panel="账号配置", + help="Cookie value used for Cookie login method", + rich_help_panel="Account Configuration", ), ] = config.COOKIES, specified_id: Annotated[ str, typer.Option( "--specified_id", - help="详情模式下的帖子/视频ID列表,多个ID用逗号分隔(支持完整URL或ID)", - rich_help_panel="基础配置", + help="Post/video ID list in detail mode, multiple IDs separated by commas (supports full URL or ID)", + rich_help_panel="Basic Configuration", ), ] = "", creator_id: Annotated[ str, typer.Option( "--creator_id", - help="创作者模式下的创作者ID列表,多个ID用逗号分隔(支持完整URL或ID)", - rich_help_panel="基础配置", + help="Creator ID list in creator mode, multiple IDs separated by commas (supports full URL or ID)", + rich_help_panel="Basic Configuration", ), ] = "", ) -> SimpleNamespace: diff --git a/database/db.py b/database/db.py index 2533c38..0a8f6cd 100644 --- a/database/db.py +++ b/database/db.py @@ -17,9 +17,9 @@ # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 # persist-1 -# 原因:将 db.py 改造为模块,移除直接执行入口,修复相对导入问题。 -# 副作用:无 -# 回滚策略:还原此文件。 +# Reason: Refactored db.py into a module, removed direct execution entry point, fixed relative import issues. +# Side effects: None +# Rollback strategy: Restore this file. import asyncio import sys from pathlib import Path diff --git a/database/models.py b/database/models.py index 2dd2f75..22462d4 100644 --- a/database/models.py +++ b/database/models.py @@ -406,9 +406,9 @@ class ZhihuContent(Base): last_modify_ts = Column(BigInteger) # persist-1 - # 原因:修复 ORM 模型定义错误,确保与数据库表结构一致。 - # 副作用:无 - # 回滚策略:还原此行 + # Reason: Fixed ORM model definition error, ensuring consistency with database table structure. 
+ # Side effects: None + # Rollback strategy: Restore this line class ZhihuComment(Base): __tablename__ = 'zhihu_comment' diff --git a/database/mongodb_store_base.py b/database/mongodb_store_base.py index 2ea72f9..771974a 100644 --- a/database/mongodb_store_base.py +++ b/database/mongodb_store_base.py @@ -16,7 +16,7 @@ # 详细许可条款请参阅项目根目录下的LICENSE文件。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 -"""MongoDB存储基类:提供连接管理和通用存储方法""" +"""MongoDB storage base class: Provides connection management and common storage methods""" import asyncio from typing import Dict, List, Optional from motor.motor_asyncio import AsyncIOMotorClient, AsyncIOMotorDatabase, AsyncIOMotorCollection @@ -25,7 +25,7 @@ from tools import utils class MongoDBConnection: - """MongoDB连接管理(单例模式)""" + """MongoDB connection management (singleton pattern)""" _instance = None _client: Optional[AsyncIOMotorClient] = None _db: Optional[AsyncIOMotorDatabase] = None @@ -37,7 +37,7 @@ class MongoDBConnection: return cls._instance async def get_client(self) -> AsyncIOMotorClient: - """获取客户端""" + """Get client""" if self._client is None: async with self._lock: if self._client is None: @@ -45,7 +45,7 @@ class MongoDBConnection: return self._client async def get_db(self) -> AsyncIOMotorDatabase: - """获取数据库""" + """Get database""" if self._db is None: async with self._lock: if self._db is None: @@ -53,7 +53,7 @@ class MongoDBConnection: return self._db async def _connect(self): - """建立连接""" + """Establish connection""" try: mongo_config = db_config.mongodb_config host = mongo_config["host"] @@ -62,14 +62,14 @@ class MongoDBConnection: password = mongo_config["password"] db_name = mongo_config["db_name"] - # 构建连接URL(有认证/无认证) + # Build connection URL (with/without authentication) if user and password: connection_url = f"mongodb://{user}:{password}@{host}:{port}/" else: connection_url = f"mongodb://{host}:{port}/" self._client = AsyncIOMotorClient(connection_url, serverSelectionTimeoutMS=5000) - await self._client.server_info() # 测试连接 + await self._client.server_info() # Test connection self._db = self._client[db_name] utils.logger.info(f"[MongoDBConnection] Connected to {host}:{port}/{db_name}") except Exception as e: @@ -77,7 +77,7 @@ class MongoDBConnection: raise async def close(self): - """关闭连接""" + """Close connection""" if self._client is not None: self._client.close() self._client = None @@ -86,24 +86,24 @@ class MongoDBConnection: class MongoDBStoreBase: - """MongoDB存储基类:提供通用的CRUD操作""" + """MongoDB storage base class: Provides common CRUD operations""" def __init__(self, collection_prefix: str): - """初始化存储基类 + """Initialize storage base class Args: - collection_prefix: 平台前缀(xhs/douyin/bilibili等) + collection_prefix: Platform prefix (xhs/douyin/bilibili, etc.) 
""" self.collection_prefix = collection_prefix self._connection = MongoDBConnection() async def get_collection(self, collection_suffix: str) -> AsyncIOMotorCollection: - """获取集合:{prefix}_{suffix}""" + """Get collection: {prefix}_{suffix}""" db = await self._connection.get_db() collection_name = f"{self.collection_prefix}_{collection_suffix}" return db[collection_name] async def save_or_update(self, collection_suffix: str, query: Dict, data: Dict) -> bool: - """保存或更新数据(upsert)""" + """Save or update data (upsert)""" try: collection = await self.get_collection(collection_suffix) await collection.update_one(query, {"$set": data}, upsert=True) @@ -113,7 +113,7 @@ class MongoDBStoreBase: return False async def find_one(self, collection_suffix: str, query: Dict) -> Optional[Dict]: - """查询单条数据""" + """Query a single record""" try: collection = await self.get_collection(collection_suffix) return await collection.find_one(query) @@ -122,7 +122,7 @@ class MongoDBStoreBase: return None async def find_many(self, collection_suffix: str, query: Dict, limit: int = 0) -> List[Dict]: - """查询多条数据(limit=0表示不限制)""" + """Query multiple records (limit=0 means no limit)""" try: collection = await self.get_collection(collection_suffix) cursor = collection.find(query) @@ -134,7 +134,7 @@ class MongoDBStoreBase: return [] async def create_index(self, collection_suffix: str, keys: List[tuple], unique: bool = False): - """创建索引:keys=[("field", 1)]""" + """Create index: keys=[("field", 1)]""" try: collection = await self.get_collection(collection_suffix) await collection.create_index(keys, unique=unique) diff --git a/main.py b/main.py index 1eb52a9..23d934d 100644 --- a/main.py +++ b/main.py @@ -114,7 +114,7 @@ async def async_cleanup() -> None: except Exception as e: error_msg = str(e).lower() if "closed" not in error_msg and "disconnected" not in error_msg: - print(f"[Main] 清理CDP浏览器时出错: {e}") + print(f"[Main] Error cleaning up CDP browser: {e}") elif getattr(crawler, "browser_context", None): try: @@ -122,7 +122,7 @@ async def async_cleanup() -> None: except Exception as e: error_msg = str(e).lower() if "closed" not in error_msg and "disconnected" not in error_msg: - print(f"[Main] 关闭浏览器上下文时出错: {e}") + print(f"[Main] Error closing browser context: {e}") if config.SAVE_DATA_OPTION in ("db", "sqlite"): await db.close() diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index 641e281..45c927c 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -20,7 +20,7 @@ # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com # @Time : 2023/12/2 18:44 -# @Desc : bilibili 请求客户端 +# @Desc : bilibili request client import asyncio import json import random @@ -47,7 +47,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): def __init__( self, - timeout=60, # 若开启爬取媒体选项,b 站的长视频需要更久的超时时间 + timeout=60, # For media crawling, Bilibili long videos need a longer timeout proxy=None, *, headers: Dict[str, str], @@ -61,11 +61,11 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): self._host = "https://api.bilibili.com" self.playwright_page = playwright_page self.cookie_dict = cookie_dict - # 初始化代理池(来自 ProxyRefreshMixin) + # Initialize proxy pool (from ProxyRefreshMixin) self.init_proxy_pool(proxy_ip_pool) async def request(self, method, url, **kwargs) -> Any: - # 每次请求前检测代理是否过期 + # Check if proxy has expired before each request await self._refresh_proxy_if_expired() async with httpx.AsyncClient(proxy=self.proxy) as client: @@ -82,8 +82,8 @@ class 
BilibiliClient(AbstractApiClient, ProxyRefreshMixin): async def pre_request_data(self, req_data: Dict) -> Dict: """ - 发送请求进行请求参数签名 - 需要从 localStorage 拿 wbi_img_urls 这参数,值如下: + Send request to sign request parameters + Need to get wbi_img_urls parameter from localStorage, value as follows: https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png :param req_data: :return: @@ -95,7 +95,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): async def get_wbi_keys(self) -> Tuple[str, str]: """ - 获取最新的 img_key 和 sub_key + Get the latest img_key and sub_key :return: """ local_storage = await self.playwright_page.evaluate("() => window.localStorage") @@ -160,12 +160,12 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): ) -> Dict: """ KuaiShou web search api - :param keyword: 搜索关键词 - :param page: 分页参数具体第几页 - :param page_size: 每一页参数的数量 - :param order: 搜索结果排序,默认位综合排序 - :param pubtime_begin_s: 发布时间开始时间戳 - :param pubtime_end_s: 发布时间结束时间戳 + :param keyword: Search keyword + :param page: Page number for pagination + :param page_size: Number of items per page + :param order: Sort order for search results, default is comprehensive sorting + :param pubtime_begin_s: Publish time start timestamp + :param pubtime_end_s: Publish time end timestamp :return: """ uri = "/x/web-interface/wbi/search/type" @@ -182,13 +182,13 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): async def get_video_info(self, aid: Union[int, None] = None, bvid: Union[str, None] = None) -> Dict: """ - Bilibli web video detail api, aid 和 bvid任选一个参数 - :param aid: 稿件avid - :param bvid: 稿件bvid + Bilibli web video detail api, choose one parameter between aid and bvid + :param aid: Video aid + :param bvid: Video bvid :return: """ if not aid and not bvid: - raise ValueError("请提供 aid 或 bvid 中的至少一个参数") + raise ValueError("Please provide at least one parameter: aid or bvid") uri = "/x/web-interface/view/detail" params = dict() @@ -201,12 +201,12 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): async def get_video_play_url(self, aid: int, cid: int) -> Dict: """ Bilibli web video play url api - :param aid: 稿件avid + :param aid: Video aid :param cid: cid :return: """ if not aid or not cid or aid <= 0 or cid <= 0: - raise ValueError("aid 和 cid 必须存在") + raise ValueError("aid and cid must exist") uri = "/x/player/wbi/playurl" qn_value = getattr(config, "BILI_QN", 80) params = { @@ -233,7 +233,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): ) return None except httpx.HTTPError as exc: # some wrong when call httpx.request method, such as connection error, client error, server error or response status code is not 2xx - utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # 保留原始异常类型名称,以便开发者调试 + utils.logger.error(f"[BilibiliClient.get_video_media] {exc.__class__.__name__} for {exc.request.url} - {exc}") # Keep original exception type name for developer debugging return None async def get_video_comments( @@ -243,9 +243,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): next: int = 0, ) -> Dict: """get video comments - :param video_id: 视频 ID - :param order_mode: 排序方式 - :param next: 评论页选择 + :param video_id: Video ID + :param order_mode: Sort order + :param next: Comment page selection :return: """ uri = "/x/v2/reply/wbi/main" @@ -266,7 +266,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): :param crawl_interval: :param 
is_fetch_sub_comments: :param callback: - max_count: 一次笔记爬取的最大评论数量 + max_count: Maximum number of comments to crawl per note :return: """ @@ -299,7 +299,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): comment_list: List[Dict] = comments_res.get("replies", []) - # 检查 is_end 和 next 是否存在 + # Check if is_end and next exist if "is_end" not in cursor_info or "next" not in cursor_info: utils.logger.warning(f"[BilibiliClient.get_video_all_comments] 'is_end' or 'next' not in cursor for video_id: {video_id}. Assuming end of comments.") is_end = True @@ -317,7 +317,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): {await self.get_video_all_level_two_comments(video_id, comment_id, CommentOrderType.DEFAULT, 10, crawl_interval, callback)} if len(result) + len(comment_list) > max_count: comment_list = comment_list[:max_count - len(result)] - if callback: # 如果有回调函数,就执行回调函数 + if callback: # If there is a callback function, execute it await callback(video_id, comment_list) await asyncio.sleep(crawl_interval) if not is_fetch_sub_comments: @@ -336,10 +336,10 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): ) -> Dict: """ get video all level two comments for a level one comment - :param video_id: 视频 ID - :param level_one_comment_id: 一级评论 ID + :param video_id: Video ID + :param level_one_comment_id: Level one comment ID :param order_mode: - :param ps: 一页评论数 + :param ps: Number of comments per page :param crawl_interval: :param callback: :return: @@ -349,7 +349,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): while True: result = await self.get_video_level_two_comments(video_id, level_one_comment_id, pn, ps, order_mode) comment_list: List[Dict] = result.get("replies", []) - if callback: # 如果有回调函数,就执行回调函数 + if callback: # If there is a callback function, execute it await callback(video_id, comment_list) await asyncio.sleep(crawl_interval) if (int(result["page"]["count"]) <= pn * ps): @@ -366,9 +366,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): order_mode: CommentOrderType, ) -> Dict: """get video level two comments - :param video_id: 视频 ID - :param level_one_comment_id: 一级评论 ID - :param order_mode: 排序方式 + :param video_id: Video ID + :param level_one_comment_id: Level one comment ID + :param order_mode: Sort order :return: """ @@ -386,10 +386,10 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): async def get_creator_videos(self, creator_id: str, pn: int, ps: int = 30, order_mode: SearchOrderType = SearchOrderType.LAST_PUBLISH) -> Dict: """get all videos for a creator - :param creator_id: 创作者 ID - :param pn: 页数 - :param ps: 一页视频数 - :param order_mode: 排序方式 + :param creator_id: Creator ID + :param pn: Page number + :param ps: Number of videos per page + :param order_mode: Sort order :return: """ @@ -405,7 +405,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): async def get_creator_info(self, creator_id: int) -> Dict: """ get creator info - :param creator_id: 作者 ID + :param creator_id: Creator ID """ uri = "/x/space/wbi/acc/info" post_data = { @@ -421,9 +421,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): ) -> Dict: """ get creator fans - :param creator_id: 创作者 ID - :param pn: 开始页数 - :param ps: 每页数量 + :param creator_id: Creator ID + :param pn: Start page number + :param ps: Number of items per page :return: """ uri = "/x/relation/fans" @@ -443,9 +443,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): ) -> Dict: """ get creator followings - :param creator_id: 创作者 ID - :param pn: 
开始页数 - :param ps: 每页数量 + :param creator_id: Creator ID + :param pn: Start page number + :param ps: Number of items per page :return: """ uri = "/x/relation/followings" @@ -460,8 +460,8 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): async def get_creator_dynamics(self, creator_id: int, offset: str = ""): """ get creator comments - :param creator_id: 创作者 ID - :param offset: 发送请求所需参数 + :param creator_id: Creator ID + :param offset: Parameter required for sending request :return: """ uri = "/x/polymer/web-dynamic/v1/feed/space" @@ -485,9 +485,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): :param creator_info: :param crawl_interval: :param callback: - :param max_count: 一个up主爬取的最大粉丝数量 + :param max_count: Maximum number of fans to crawl for a creator - :return: up主粉丝数列表 + :return: List of creator fans """ creator_id = creator_info["id"] result = [] @@ -499,7 +499,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): pn += 1 if len(result) + len(fans_list) > max_count: fans_list = fans_list[:max_count - len(result)] - if callback: # 如果有回调函数,就执行回调函数 + if callback: # If there is a callback function, execute it await callback(creator_info, fans_list) await asyncio.sleep(crawl_interval) if not fans_list: @@ -519,9 +519,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): :param creator_info: :param crawl_interval: :param callback: - :param max_count: 一个up主爬取的最大关注者数量 + :param max_count: Maximum number of followings to crawl for a creator - :return: up主关注者列表 + :return: List of creator followings """ creator_id = creator_info["id"] result = [] @@ -533,7 +533,7 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): pn += 1 if len(result) + len(followings_list) > max_count: followings_list = followings_list[:max_count - len(result)] - if callback: # 如果有回调函数,就执行回调函数 + if callback: # If there is a callback function, execute it await callback(creator_info, followings_list) await asyncio.sleep(crawl_interval) if not followings_list: @@ -553,9 +553,9 @@ class BilibiliClient(AbstractApiClient, ProxyRefreshMixin): :param creator_info: :param crawl_interval: :param callback: - :param max_count: 一个up主爬取的最大动态数量 + :param max_count: Maximum number of dynamics to crawl for a creator - :return: up主关注者列表 + :return: List of creator dynamics """ creator_id = creator_info["id"] result = [] diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 17e63f4..6933eab 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -20,7 +20,7 @@ # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com # @Time : 2023/12/2 18:44 -# @Desc : B站爬虫 +# @Desc : Bilibili Crawler import asyncio import os @@ -64,7 +64,7 @@ class BilibiliCrawler(AbstractCrawler): self.index_url = "https://www.bilibili.com" self.user_agent = utils.get_user_agent() self.cdp_manager = None - self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新 + self.ip_proxy_pool = None # Proxy IP pool for automatic proxy refresh async def start(self): playwright_proxy_format, httpx_proxy_format = None, None @@ -74,9 +74,9 @@ class BilibiliCrawler(AbstractCrawler): playwright_proxy_format, httpx_proxy_format = utils.format_proxy_info(ip_proxy_info) async with async_playwright() as playwright: - # 根据配置选择启动模式 + # Choose launch mode based on configuration if config.ENABLE_CDP_MODE: - utils.logger.info("[BilibiliCrawler] 使用CDP模式启动浏览器") + utils.logger.info("[BilibiliCrawler] Launching browser using CDP mode") self.browser_context = await self.launch_browser_with_cdp( playwright, 
playwright_proxy_format, @@ -84,7 +84,7 @@ class BilibiliCrawler(AbstractCrawler): headless=config.CDP_HEADLESS, ) else: - utils.logger.info("[BilibiliCrawler] 使用标准模式启动浏览器") + utils.logger.info("[BilibiliCrawler] Launching browser using standard mode") # Launch a browser context. chromium = playwright.chromium self.browser_context = await self.launch_browser(chromium, None, self.user_agent, headless=config.HEADLESS) @@ -149,31 +149,31 @@ class BilibiliCrawler(AbstractCrawler): end: str = config.END_DAY, ) -> Tuple[str, str]: """ - 获取 bilibili 作品发布日期起始时间戳 pubtime_begin_s 与发布日期结束时间戳 pubtime_end_s + Get bilibili publish start timestamp pubtime_begin_s and publish end timestamp pubtime_end_s --- - :param start: 发布日期起始时间,YYYY-MM-DD - :param end: 发布日期结束时间,YYYY-MM-DD + :param start: Publish date start time, YYYY-MM-DD + :param end: Publish date end time, YYYY-MM-DD Note --- - - 搜索的时间范围为 start 至 end,包含 start 和 end - - 若要搜索同一天的内容,为了包含 start 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_begin_s 的值加上一天再减去一秒,即 start 当天的最后一秒 - - 如仅搜索 2024-01-05 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704470399 - 转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59) - - 若要搜索 start 至 end 的内容,为了包含 end 当天的搜索内容,则 pubtime_end_s 的值应该为 pubtime_end_s 的值加上一天再减去一秒,即 end 当天的最后一秒 - - 如搜索 2024-01-05 - 2024-01-06 的内容,pubtime_begin_s = 1704384000,pubtime_end_s = 1704556799 - 转换为可读的 datetime 对象:pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0),pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59) + - Search time range is from start to end, including both start and end + - To search content from the same day, to include search content from that day, pubtime_end_s should be pubtime_begin_s plus one day minus one second, i.e., the last second of start day + - For example, searching only 2024-01-05 content, pubtime_begin_s = 1704384000, pubtime_end_s = 1704470399 + Converted to readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 5, 23, 59, 59) + - To search content from start to end, to include search content from end day, pubtime_end_s should be pubtime_end_s plus one day minus one second, i.e., the last second of end day + - For example, searching 2024-01-05 - 2024-01-06 content, pubtime_begin_s = 1704384000, pubtime_end_s = 1704556799 + Converted to readable datetime objects: pubtime_begin_s = datetime.datetime(2024, 1, 5, 0, 0), pubtime_end_s = datetime.datetime(2024, 1, 6, 23, 59, 59) """ - # 转换 start 与 end 为 datetime 对象 + # Convert start and end to datetime objects start_day: datetime = datetime.strptime(start, "%Y-%m-%d") end_day: datetime = datetime.strptime(end, "%Y-%m-%d") if start_day > end_day: raise ValueError("Wrong time range, please check your start and end argument, to ensure that the start cannot exceed end") - elif start_day == end_day: # 搜索同一天的内容 - end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 start_day + 1 day - 1 second - else: # 搜索 start 至 end - end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # 则将 end_day 设置为 end_day + 1 day - 1 second - # 将其重新转换为时间戳 + elif start_day == end_day: # Searching content from the same day + end_day = (start_day + timedelta(days=1) - timedelta(seconds=1)) # Set end_day to start_day + 1 day - 1 second + else: # Searching from start to end + end_day = (end_day + timedelta(days=1) - timedelta(seconds=1)) # Set end_day to end_day + 1 day - 1 second + # Convert back to timestamps return 
str(int(start_day.timestamp())), str(int(end_day.timestamp())) async def search_by_keywords(self): @@ -203,8 +203,8 @@ class BilibiliCrawler(AbstractCrawler): page=page, page_size=bili_limit_count, order=SearchOrderType.DEFAULT, - pubtime_begin_s=0, # 作品发布日期起始时间戳 - pubtime_end_s=0, # 作品发布日期结束日期时间戳 + pubtime_begin_s=0, # Publish date start timestamp + pubtime_end_s=0, # Publish date end timestamp ) video_list: List[Dict] = videos_res.get("result") @@ -508,7 +508,7 @@ class BilibiliCrawler(AbstractCrawler): "height": 1080 }, user_agent=user_agent, - channel="chrome", # 使用系统的Chrome稳定版 + channel="chrome", # Use system's stable Chrome version ) return browser_context else: @@ -525,7 +525,7 @@ class BilibiliCrawler(AbstractCrawler): headless: bool = True, ) -> BrowserContext: """ - 使用CDP模式启动浏览器 + Launch browser using CDP mode """ try: self.cdp_manager = CDPBrowserManager() @@ -536,22 +536,22 @@ class BilibiliCrawler(AbstractCrawler): headless=headless, ) - # 显示浏览器信息 + # Display browser information browser_info = await self.cdp_manager.get_browser_info() - utils.logger.info(f"[BilibiliCrawler] CDP浏览器信息: {browser_info}") + utils.logger.info(f"[BilibiliCrawler] CDP browser info: {browser_info}") return browser_context except Exception as e: - utils.logger.error(f"[BilibiliCrawler] CDP模式启动失败,回退到标准模式: {e}") - # 回退到标准模式 + utils.logger.error(f"[BilibiliCrawler] CDP mode launch failed, fallback to standard mode: {e}") + # Fallback to standard mode chromium = playwright.chromium return await self.launch_browser(chromium, playwright_proxy, user_agent, headless) async def close(self): """Close browser context""" try: - # 如果使用CDP模式,需要特殊处理 + # If using CDP mode, special handling is required if self.cdp_manager: await self.cdp_manager.cleanup() self.cdp_manager = None diff --git a/media_platform/bilibili/field.py b/media_platform/bilibili/field.py index 1bb6853..4a4ec75 100644 --- a/media_platform/bilibili/field.py +++ b/media_platform/bilibili/field.py @@ -27,28 +27,28 @@ from enum import Enum class SearchOrderType(Enum): - # 综合排序 + # Comprehensive sorting DEFAULT = "" - # 最多点击 + # Most clicks MOST_CLICK = "click" - # 最新发布 + # Latest published LAST_PUBLISH = "pubdate" - # 最多弹幕 + # Most danmu (comments) MOST_DANMU = "dm" - # 最多收藏 + # Most bookmarks MOST_MARK = "stow" class CommentOrderType(Enum): - # 仅按热度 + # By popularity only DEFAULT = 0 - # 按热度+按时间 + # By popularity + time MIXED = 1 - # 按时间 + # By time TIME = 2 diff --git a/media_platform/bilibili/help.py b/media_platform/bilibili/help.py index 70a4aa1..74fdfa2 100644 --- a/media_platform/bilibili/help.py +++ b/media_platform/bilibili/help.py @@ -21,8 +21,8 @@ # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com # @Time : 2023/12/2 23:26 -# @Desc : bilibili 请求参数签名 -# 逆向实现参考:https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95 +# @Desc : bilibili request parameter signing +# Reverse engineering implementation reference: https://socialsisteryi.github.io/bilibili-API-collect/docs/misc/sign/wbi.html#wbi%E7%AD%BE%E5%90%8D%E7%AE%97%E6%B3%95 import re import urllib.parse from hashlib import md5 @@ -45,7 +45,7 @@ class BilibiliSign: def get_salt(self) -> str: """ - 获取加盐的 key + Get the salted key :return: """ salt = "" @@ -56,8 +56,8 @@ class BilibiliSign: def sign(self, req_data: Dict) -> Dict: """ - 请求参数中加上当前时间戳对请求参数中的key进行字典序排序 - 再将请求参数进行 url 编码集合 salt 进行 md5 就可以生成w_rid参数了 + Add current timestamp to request parameters, sort keys in dictionary order, + then URL encode the parameters and combine 
with salt to generate md5 for w_rid parameter :param req_data: :return: """ @@ -65,35 +65,35 @@ class BilibiliSign: req_data.update({"wts": current_ts}) req_data = dict(sorted(req_data.items())) req_data = { - # 过滤 value 中的 "!'()*" 字符 + # Filter "!'()*" characters from values k: ''.join(filter(lambda ch: ch not in "!'()*", str(v))) for k, v in req_data.items() } query = urllib.parse.urlencode(req_data) salt = self.get_salt() - wbi_sign = md5((query + salt).encode()).hexdigest() # 计算 w_rid + wbi_sign = md5((query + salt).encode()).hexdigest() # Calculate w_rid req_data['w_rid'] = wbi_sign return req_data def parse_video_info_from_url(url: str) -> VideoUrlInfo: """ - 从B站视频URL中解析出视频ID + Parse video ID from Bilibili video URL Args: - url: B站视频链接 + url: Bilibili video link - https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click - https://www.bilibili.com/video/BV1d54y1g7db - - BV1d54y1g7db (直接传入BV号) + - BV1d54y1g7db (directly pass BV number) Returns: - VideoUrlInfo: 包含视频ID的对象 + VideoUrlInfo: Object containing video ID """ - # 如果传入的已经是BV号,直接返回 + # If the input is already a BV number, return directly if url.startswith("BV"): return VideoUrlInfo(video_id=url) - # 使用正则表达式提取BV号 - # 匹配 /video/BV... 或 /video/av... 格式 + # Use regex to extract BV number + # Match /video/BV... or /video/av... format bv_pattern = r'/video/(BV[a-zA-Z0-9]+)' match = re.search(bv_pattern, url) @@ -101,26 +101,26 @@ def parse_video_info_from_url(url: str) -> VideoUrlInfo: video_id = match.group(1) return VideoUrlInfo(video_id=video_id) - raise ValueError(f"无法从URL中解析出视频ID: {url}") + raise ValueError(f"Unable to parse video ID from URL: {url}") def parse_creator_info_from_url(url: str) -> CreatorUrlInfo: """ - 从B站创作者空间URL中解析出创作者ID + Parse creator ID from Bilibili creator space URL Args: - url: B站创作者空间链接 + url: Bilibili creator space link - https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0 - https://space.bilibili.com/20813884 - - 434377496 (直接传入UID) + - 434377496 (directly pass UID) Returns: - CreatorUrlInfo: 包含创作者ID的对象 + CreatorUrlInfo: Object containing creator ID """ - # 如果传入的已经是纯数字ID,直接返回 + # If the input is already a numeric ID, return directly if url.isdigit(): return CreatorUrlInfo(creator_id=url) - # 使用正则表达式提取UID - # 匹配 /space.bilibili.com/数字 格式 + # Use regex to extract UID + # Match /space.bilibili.com/number format uid_pattern = r'space\.bilibili\.com/(\d+)' match = re.search(uid_pattern, url) @@ -128,20 +128,20 @@ def parse_creator_info_from_url(url: str) -> CreatorUrlInfo: creator_id = match.group(1) return CreatorUrlInfo(creator_id=creator_id) - raise ValueError(f"无法从URL中解析出创作者ID: {url}") + raise ValueError(f"Unable to parse creator ID from URL: {url}") if __name__ == '__main__': - # 测试视频URL解析 + # Test video URL parsing video_url1 = "https://www.bilibili.com/video/BV1dwuKzmE26/?spm_id_from=333.1387.homepage.video_card.click" video_url2 = "BV1d54y1g7db" - print("视频URL解析测试:") + print("Video URL parsing test:") print(f"URL1: {video_url1} -> {parse_video_info_from_url(video_url1)}") print(f"URL2: {video_url2} -> {parse_video_info_from_url(video_url2)}") - # 测试创作者URL解析 + # Test creator URL parsing creator_url1 = "https://space.bilibili.com/434377496?spm_id_from=333.1007.0.0" creator_url2 = "20813884" - print("\n创作者URL解析测试:") + print("\nCreator URL parsing test:") print(f"URL1: {creator_url1} -> {parse_creator_info_from_url(creator_url1)}") print(f"URL2: {creator_url2} -> {parse_creator_info_from_url(creator_url2)}") diff --git a/media_platform/bilibili/login.py 
b/media_platform/bilibili/login.py index c9f9a89..bd50fe2 100644 --- a/media_platform/bilibili/login.py +++ b/media_platform/bilibili/login.py @@ -21,7 +21,7 @@ # -*- coding: utf-8 -*- # @Author : relakkes@gmail.com # @Time : 2023/12/2 18:44 -# @Desc : bilibli登录实现类 +# @Desc : bilibili login implementation class import asyncio import functools diff --git a/media_platform/douyin/field.py b/media_platform/douyin/field.py index 2103b5a..914c00f 100644 --- a/media_platform/douyin/field.py +++ b/media_platform/douyin/field.py @@ -23,21 +23,21 @@ from enum import Enum class SearchChannelType(Enum): """search channel type""" - GENERAL = "aweme_general" # 综合 - VIDEO = "aweme_video_web" # 视频 - USER = "aweme_user_web" # 用户 - LIVE = "aweme_live" # 直播 + GENERAL = "aweme_general" # General + VIDEO = "aweme_video_web" # Video + USER = "aweme_user_web" # User + LIVE = "aweme_live" # Live class SearchSortType(Enum): """search sort type""" - GENERAL = 0 # 综合排序 - MOST_LIKE = 1 # 最多点赞 - LATEST = 2 # 最新发布 + GENERAL = 0 # Comprehensive sorting + MOST_LIKE = 1 # Most likes + LATEST = 2 # Latest published class PublishTimeType(Enum): """publish time type""" - UNLIMITED = 0 # 不限 - ONE_DAY = 1 # 一天内 - ONE_WEEK = 7 # 一周内 - SIX_MONTH = 180 # 半年内 + UNLIMITED = 0 # Unlimited + ONE_DAY = 1 # Within one day + ONE_WEEK = 7 # Within one week + SIX_MONTH = 180 # Within six months diff --git a/media_platform/douyin/help.py b/media_platform/douyin/help.py index 9d26917..184b681 100644 --- a/media_platform/douyin/help.py +++ b/media_platform/douyin/help.py @@ -22,7 +22,7 @@ # @Author : relakkes@gmail.com # @Name : 程序员阿江-Relakkes # @Time : 2024/6/10 02:24 -# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除 +# @Desc : Get a_bogus parameter, for learning and communication only, do not use for commercial purposes, contact author to delete if infringement import random import re @@ -38,7 +38,7 @@ douyin_sign_obj = execjs.compile(open('libs/douyin.js', encoding='utf-8-sig').re def get_web_id(): """ - 生成随机的webid + Generate random webid Returns: """ @@ -60,13 +60,13 @@ def get_web_id(): async def get_a_bogus(url: str, params: str, post_data: dict, user_agent: str, page: Page = None): """ - 获取 a_bogus 参数, 目前不支持post请求类型的签名 + Get a_bogus parameter, currently does not support POST request type signature """ return get_a_bogus_from_js(url, params, user_agent) def get_a_bogus_from_js(url: str, params: str, user_agent: str): """ - 通过js获取 a_bogus 参数 + Get a_bogus parameter through js Args: url: params: @@ -84,8 +84,8 @@ def get_a_bogus_from_js(url: str, params: str, user_agent: str): async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page): """ - 通过playright获取 a_bogus 参数 - playwright版本已失效 + Get a_bogus parameter through playwright + playwright version is deprecated Returns: """ @@ -100,73 +100,73 @@ async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: s def parse_video_info_from_url(url: str) -> VideoUrlInfo: """ - 从抖音视频URL中解析出视频ID - 支持以下格式: - 1. 普通视频链接: https://www.douyin.com/video/7525082444551310602 - 2. 带modal_id参数的链接: + Parse video ID from Douyin video URL + Supports the following formats: + 1. Normal video link: https://www.douyin.com/video/7525082444551310602 + 2. Link with modal_id parameter: - https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?modal_id=7525082444551310602 - https://www.douyin.com/root/search/python?modal_id=7471165520058862848 - 3. 短链接: https://v.douyin.com/iF12345ABC/ (需要client解析) - 4. 纯ID: 7525082444551310602 + 3. 
Short link: https://v.douyin.com/iF12345ABC/ (requires client parsing) + 4. Pure ID: 7525082444551310602 Args: - url: 抖音视频链接或ID + url: Douyin video link or ID Returns: - VideoUrlInfo: 包含视频ID的对象 + VideoUrlInfo: Object containing video ID """ - # 如果是纯数字ID,直接返回 + # If it's a pure numeric ID, return directly if url.isdigit(): return VideoUrlInfo(aweme_id=url, url_type="normal") - # 检查是否是短链接 (v.douyin.com) + # Check if it's a short link (v.douyin.com) if "v.douyin.com" in url or url.startswith("http") and len(url) < 50 and "video" not in url: - return VideoUrlInfo(aweme_id="", url_type="short") # 需要通过client解析 + return VideoUrlInfo(aweme_id="", url_type="short") # Requires client parsing - # 尝试从URL参数中提取modal_id + # Try to extract modal_id from URL parameters params = extract_url_params_to_dict(url) modal_id = params.get("modal_id") if modal_id: return VideoUrlInfo(aweme_id=modal_id, url_type="modal") - # 从标准视频URL中提取ID: /video/数字 + # Extract ID from standard video URL: /video/number video_pattern = r'/video/(\d+)' match = re.search(video_pattern, url) if match: aweme_id = match.group(1) return VideoUrlInfo(aweme_id=aweme_id, url_type="normal") - raise ValueError(f"无法从URL中解析出视频ID: {url}") + raise ValueError(f"Unable to parse video ID from URL: {url}") def parse_creator_info_from_url(url: str) -> CreatorUrlInfo: """ - 从抖音创作者主页URL中解析出创作者ID (sec_user_id) - 支持以下格式: - 1. 创作者主页: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main - 2. 纯ID: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE + Parse creator ID (sec_user_id) from Douyin creator homepage URL + Supports the following formats: + 1. Creator homepage: https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main + 2. 
Pure ID: MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE Args: - url: 抖音创作者主页链接或sec_user_id + url: Douyin creator homepage link or sec_user_id Returns: - CreatorUrlInfo: 包含创作者ID的对象 + CreatorUrlInfo: Object containing creator ID """ - # 如果是纯ID格式(通常以MS4wLjABAAAA开头),直接返回 + # If it's a pure ID format (usually starts with MS4wLjABAAAA), return directly if url.startswith("MS4wLjABAAAA") or (not url.startswith("http") and "douyin.com" not in url): return CreatorUrlInfo(sec_user_id=url) - # 从创作者主页URL中提取sec_user_id: /user/xxx + # Extract sec_user_id from creator homepage URL: /user/xxx user_pattern = r'/user/([^/?]+)' match = re.search(user_pattern, url) if match: sec_user_id = match.group(1) return CreatorUrlInfo(sec_user_id=sec_user_id) - raise ValueError(f"无法从URL中解析出创作者ID: {url}") + raise ValueError(f"Unable to parse creator ID from URL: {url}") if __name__ == '__main__': - # 测试视频URL解析 - print("=== 视频URL解析测试 ===") + # Test video URL parsing + print("=== Video URL Parsing Test ===") test_urls = [ "https://www.douyin.com/video/7525082444551310602", "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main&modal_id=7525082444551310602", @@ -177,13 +177,13 @@ if __name__ == '__main__': try: result = parse_video_info_from_url(url) print(f"✓ URL: {url[:80]}...") - print(f" 结果: {result}\n") + print(f" Result: {result}\n") except Exception as e: print(f"✗ URL: {url}") - print(f" 错误: {e}\n") + print(f" Error: {e}\n") - # 测试创作者URL解析 - print("=== 创作者URL解析测试 ===") + # Test creator URL parsing + print("=== Creator URL Parsing Test ===") test_creator_urls = [ "https://www.douyin.com/user/MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE?from_tab_name=main", "MS4wLjABAAAATJPY7LAlaa5X-c8uNdWkvz0jUGgpw4eeXIwu_8BhvqE", @@ -192,7 +192,7 @@ if __name__ == '__main__': try: result = parse_creator_info_from_url(url) print(f"✓ URL: {url[:80]}...") - print(f" 结果: {result}\n") + print(f" Result: {result}\n") except Exception as e: print(f"✗ URL: {url}") - print(f" 错误: {e}\n") + print(f" Error: {e}\n") diff --git a/media_platform/douyin/login.py b/media_platform/douyin/login.py index b9076de..ea78e34 100644 --- a/media_platform/douyin/login.py +++ b/media_platform/douyin/login.py @@ -53,7 +53,7 @@ class DouYinLogin(AbstractLogin): async def begin(self): """ Start login douyin website - 滑块中间页面的验证准确率不太OK... 如果没有特俗要求,建议不开抖音登录,或者使用cookies登录 + The verification accuracy of the slider verification is not very good... 
If there are no special requirements, it is recommended not to use Douyin login, or use cookie login """ # popup login dialog @@ -69,7 +69,7 @@ class DouYinLogin(AbstractLogin): else: raise ValueError("[DouYinLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") - # 如果页面重定向到滑动验证码页面,需要再次滑动滑块 + # If the page redirects to the slider verification page, need to slide again await asyncio.sleep(6) current_page_title = await self.context_page.title() if "验证码中间页" in current_page_title: @@ -147,10 +147,10 @@ class DouYinLogin(AbstractLogin): send_sms_code_btn = self.context_page.locator("xpath=//span[text() = '获取验证码']") await send_sms_code_btn.click() - # 检查是否有滑动验证码 + # Check if there is slider verification await self.check_page_display_slider(move_step=10, slider_level="easy") cache_client = CacheFactory.create_cache(config.CACHE_TYPE_MEMORY) - max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 + max_get_sms_code_time = 60 * 2 # Maximum time to get verification code is 2 minutes while max_get_sms_code_time > 0: utils.logger.info(f"[DouYinLogin.login_by_mobile] get douyin sms code from redis remaining time {max_get_sms_code_time}s ...") await asyncio.sleep(1) @@ -164,20 +164,20 @@ class DouYinLogin(AbstractLogin): await sms_code_input_ele.fill(value=sms_code_value.decode()) await asyncio.sleep(0.5) submit_btn_ele = self.context_page.locator("xpath=//button[@class='web-login-button']") - await submit_btn_ele.click() # 点击登录 - # todo ... 应该还需要检查验证码的正确性有可能输入的验证码不正确 + await submit_btn_ele.click() # Click login + # todo ... should also check the correctness of the verification code, it may be incorrect break async def check_page_display_slider(self, move_step: int = 10, slider_level: str = "easy"): """ - 检查页面是否出现滑动验证码 + Check if slider verification appears on the page :return: """ - # 等待滑动验证码的出现 + # Wait for slider verification to appear back_selector = "#captcha-verify-image" try: await self.context_page.wait_for_selector(selector=back_selector, state="visible", timeout=30 * 1000) - except PlaywrightTimeoutError: # 没有滑动验证码,直接返回 + except PlaywrightTimeoutError: # No slider verification, return directly return gap_selector = 'xpath=//*[@id="captcha_container"]/div/div[2]/img[2]' @@ -191,16 +191,16 @@ class DouYinLogin(AbstractLogin): await self.move_slider(back_selector, gap_selector, move_step, slider_level) await asyncio.sleep(1) - # 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮 + # If the slider is too slow or verification failed, it will prompt "操作过慢", click the refresh button here page_content = await self.context_page.content() if "操作过慢" in page_content or "提示重新操作" in page_content: utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify failed, retry ...") await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]") continue - # 滑动成功后,等待滑块消失 + # After successful sliding, wait for the slider to disappear await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000) - # 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码 + # If the slider disappears, it means the verification is successful, break the loop. 
If not, it means the verification failed, the above line will throw an exception and be caught to continue the loop utils.logger.info("[DouYinLogin.check_page_display_slider] slider verify success ...") slider_verify_success = True except Exception as e: @@ -213,10 +213,10 @@ class DouYinLogin(AbstractLogin): async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"): """ Move the slider to the right to complete the verification - :param back_selector: 滑动验证码背景图片的选择器 - :param gap_selector: 滑动验证码的滑块选择器 - :param move_step: 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢 - :param slider_level: 滑块难度 easy hard,分别对应手机验证码的滑块和验证码中间的滑块 + :param back_selector: Selector for the slider verification background image + :param gap_selector: Selector for the slider verification slider + :param move_step: Controls the ratio of single movement speed, default is 1, meaning the distance moves in 0.1 seconds no matter how far, larger value means slower + :param slider_level: Slider difficulty easy hard, corresponding to the slider for mobile verification code and the slider in the middle of verification code :return: """ @@ -234,31 +234,31 @@ class DouYinLogin(AbstractLogin): ) gap_src = str(await gap_elements.get_property("src")) # type: ignore - # 识别滑块位置 + # Identify slider position slide_app = utils.Slide(gap=gap_src, bg=slide_back) distance = slide_app.discern() - # 获取移动轨迹 + # Get movement trajectory tracks = utils.get_tracks(distance, slider_level) new_1 = tracks[-1] - (sum(tracks) - distance) tracks.pop() tracks.append(new_1) - # 根据轨迹拖拽滑块到指定位置 + # Drag slider to specified position according to trajectory element = await self.context_page.query_selector(gap_selector) bounding_box = await element.bounding_box() # type: ignore await self.context_page.mouse.move(bounding_box["x"] + bounding_box["width"] / 2, # type: ignore bounding_box["y"] + bounding_box["height"] / 2) # type: ignore - # 这里获取到x坐标中心点位置 + # Get x coordinate center position x = bounding_box["x"] + bounding_box["width"] / 2 # type: ignore - # 模拟滑动操作 + # Simulate sliding operation await element.hover() # type: ignore await self.context_page.mouse.down() for track in tracks: - # 循环鼠标按照轨迹移动 - # steps 是控制单次移动速度的比例是1/10 默认是1 相当于 传入的这个距离不管多远0.1秒钟移动完 越大越慢 + # Loop mouse movement according to trajectory + # steps controls the ratio of single movement speed, default is 1, meaning the distance moves in 0.1 seconds no matter how far, larger value means slower await self.context_page.mouse.move(x + track, 0, steps=move_step) x += track await self.context_page.mouse.up() diff --git a/media_platform/kuaishou/client.py b/media_platform/kuaishou/client.py index e402380..c54f400 100644 --- a/media_platform/kuaishou/client.py +++ b/media_platform/kuaishou/client.py @@ -57,11 +57,11 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): self.playwright_page = playwright_page self.cookie_dict = cookie_dict self.graphql = KuaiShouGraphQL() - # 初始化代理池(来自 ProxyRefreshMixin) + # Initialize proxy pool (from ProxyRefreshMixin) self.init_proxy_pool(proxy_ip_pool) async def request(self, method, url, **kwargs) -> Any: - # 每次请求前检测代理是否过期 + # Check if proxy is expired before each request await self._refresh_proxy_if_expired() async with httpx.AsyncClient(proxy=self.proxy) as client: @@ -222,7 +222,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): comments = vision_commen_list.get("rootComments", []) if len(result) + len(comments) > max_count: comments = comments[: max_count - len(result)] - if 
callback: # 如果有回调函数,就执行回调函数 + if callback: # If there is a callback function, execute the callback function await callback(photo_id, comments) result.extend(comments) await asyncio.sleep(crawl_interval) @@ -240,12 +240,12 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): callback: Optional[Callable] = None, ) -> List[Dict]: """ - 获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息 + Get all second-level comments under specified first-level comments, this method will continue to find all second-level comment information under first-level comments Args: - comments: 评论列表 - photo_id: 视频id - crawl_interval: 爬取一次评论的延迟单位(秒) - callback: 一次评论爬取结束后 + comments: Comment list + photo_id: Video ID + crawl_interval: Delay unit for crawling comments once (seconds) + callback: Callback after one comment crawl ends Returns: """ @@ -285,7 +285,7 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): async def get_creator_info(self, user_id: str) -> Dict: """ eg: https://www.kuaishou.com/profile/3x4jtnbfter525a - 快手用户主页 + Kuaishou user homepage """ visionProfile = await self.get_creator_profile(user_id) @@ -298,11 +298,11 @@ class KuaiShouClient(AbstractApiClient, ProxyRefreshMixin): callback: Optional[Callable] = None, ) -> List[Dict]: """ - 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 + Get all posts published by the specified user, this method will continue to find all post information under a user Args: - user_id: 用户ID - crawl_interval: 爬取一次的延迟单位(秒) - callback: 一次分页爬取结束后的更新回调函数 + user_id: User ID + crawl_interval: Delay unit for crawling once (seconds) + callback: Update callback function after one page crawl ends Returns: """ diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index b0655f4..a7c7c27 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -58,7 +58,7 @@ class KuaishouCrawler(AbstractCrawler): self.index_url = "https://www.kuaishou.com" self.user_agent = utils.get_user_agent() self.cdp_manager = None - self.ip_proxy_pool = None # 代理IP池,用于代理自动刷新 + self.ip_proxy_pool = None # Proxy IP pool, used for automatic proxy refresh async def start(self): playwright_proxy_format, httpx_proxy_format = None, None @@ -72,9 +72,9 @@ class KuaishouCrawler(AbstractCrawler): ) async with async_playwright() as playwright: - # 根据配置选择启动模式 + # Select startup mode based on configuration if config.ENABLE_CDP_MODE: - utils.logger.info("[KuaishouCrawler] 使用CDP模式启动浏览器") + utils.logger.info("[KuaishouCrawler] Launching browser using CDP mode") self.browser_context = await self.launch_browser_with_cdp( playwright, playwright_proxy_format, @@ -82,7 +82,7 @@ class KuaishouCrawler(AbstractCrawler): headless=config.CDP_HEADLESS, ) else: - utils.logger.info("[KuaishouCrawler] 使用标准模式启动浏览器") + utils.logger.info("[KuaishouCrawler] Launching browser using standard mode") # Launch a browser context. 
chromium = playwright.chromium self.browser_context = await self.launch_browser( @@ -318,7 +318,7 @@ class KuaishouCrawler(AbstractCrawler): }, playwright_page=self.context_page, cookie_dict=cookie_dict, - proxy_ip_pool=self.ip_proxy_pool, # 传递代理池用于自动刷新 + proxy_ip_pool=self.ip_proxy_pool, # Pass proxy pool for automatic refresh ) return ks_client_obj @@ -344,7 +344,7 @@ class KuaishouCrawler(AbstractCrawler): proxy=playwright_proxy, # type: ignore viewport={"width": 1920, "height": 1080}, user_agent=user_agent, - channel="chrome", # 使用系统的Chrome稳定版 + channel="chrome", # Use system's stable Chrome version ) return browser_context else: @@ -362,7 +362,7 @@ class KuaishouCrawler(AbstractCrawler): headless: bool = True, ) -> BrowserContext: """ - 使用CDP模式启动浏览器 + Launch browser using CDP mode """ try: self.cdp_manager = CDPBrowserManager() @@ -373,17 +373,17 @@ class KuaishouCrawler(AbstractCrawler): headless=headless, ) - # 显示浏览器信息 + # Display browser information browser_info = await self.cdp_manager.get_browser_info() - utils.logger.info(f"[KuaishouCrawler] CDP浏览器信息: {browser_info}") + utils.logger.info(f"[KuaishouCrawler] CDP browser info: {browser_info}") return browser_context except Exception as e: utils.logger.error( - f"[KuaishouCrawler] CDP模式启动失败,回退到标准模式: {e}" + f"[KuaishouCrawler] CDP mode launch failed, fallback to standard mode: {e}" ) - # 回退到标准模式 + # Fallback to standard mode chromium = playwright.chromium return await self.launch_browser( chromium, playwright_proxy, user_agent, headless @@ -438,7 +438,7 @@ class KuaishouCrawler(AbstractCrawler): async def close(self): """Close browser context""" - # 如果使用CDP模式,需要特殊处理 + # If using CDP mode, need special handling if self.cdp_manager: await self.cdp_manager.cleanup() self.cdp_manager = None diff --git a/media_platform/kuaishou/graphql.py b/media_platform/kuaishou/graphql.py index bf7af48..3f19eb7 100644 --- a/media_platform/kuaishou/graphql.py +++ b/media_platform/kuaishou/graphql.py @@ -18,8 +18,8 @@ # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 -# 快手的数据传输是基于GraphQL实现的 -# 这个类负责获取一些GraphQL的schema +# Kuaishou's data transmission is based on GraphQL +# This class is responsible for obtaining some GraphQL schemas from typing import Dict diff --git a/media_platform/kuaishou/help.py b/media_platform/kuaishou/help.py index 6cd4653..ea4ebc8 100644 --- a/media_platform/kuaishou/help.py +++ b/media_platform/kuaishou/help.py @@ -26,59 +26,59 @@ from model.m_kuaishou import VideoUrlInfo, CreatorUrlInfo def parse_video_info_from_url(url: str) -> VideoUrlInfo: """ - 从快手视频URL中解析出视频ID - 支持以下格式: - 1. 完整视频URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search" - 2. 纯视频ID: "3x3zxz4mjrsc8ke" + Parse video ID from Kuaishou video URL + Supports the following formats: + 1. Full video URL: "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search" + 2. 
Pure video ID: "3x3zxz4mjrsc8ke" Args: - url: 快手视频链接或视频ID + url: Kuaishou video link or video ID Returns: - VideoUrlInfo: 包含视频ID的对象 + VideoUrlInfo: Object containing video ID """ - # 如果不包含http且不包含kuaishou.com,认为是纯ID + # If it doesn't contain http and doesn't contain kuaishou.com, consider it as pure ID if not url.startswith("http") and "kuaishou.com" not in url: return VideoUrlInfo(video_id=url, url_type="normal") - # 从标准视频URL中提取ID: /short-video/视频ID + # Extract ID from standard video URL: /short-video/video_ID video_pattern = r'/short-video/([a-zA-Z0-9_-]+)' match = re.search(video_pattern, url) if match: video_id = match.group(1) return VideoUrlInfo(video_id=video_id, url_type="normal") - raise ValueError(f"无法从URL中解析出视频ID: {url}") + raise ValueError(f"Unable to parse video ID from URL: {url}") def parse_creator_info_from_url(url: str) -> CreatorUrlInfo: """ - 从快手创作者主页URL中解析出创作者ID - 支持以下格式: - 1. 创作者主页: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs" - 2. 纯ID: "3x4sm73aye7jq7i" + Parse creator ID from Kuaishou creator homepage URL + Supports the following formats: + 1. Creator homepage: "https://www.kuaishou.com/profile/3x84qugg4ch9zhs" + 2. Pure ID: "3x4sm73aye7jq7i" Args: - url: 快手创作者主页链接或user_id + url: Kuaishou creator homepage link or user_id Returns: - CreatorUrlInfo: 包含创作者ID的对象 + CreatorUrlInfo: Object containing creator ID """ - # 如果不包含http且不包含kuaishou.com,认为是纯ID + # If it doesn't contain http and doesn't contain kuaishou.com, consider it as pure ID if not url.startswith("http") and "kuaishou.com" not in url: return CreatorUrlInfo(user_id=url) - # 从创作者主页URL中提取user_id: /profile/xxx + # Extract user_id from creator homepage URL: /profile/xxx user_pattern = r'/profile/([a-zA-Z0-9_-]+)' match = re.search(user_pattern, url) if match: user_id = match.group(1) return CreatorUrlInfo(user_id=user_id) - raise ValueError(f"无法从URL中解析出创作者ID: {url}") + raise ValueError(f"Unable to parse creator ID from URL: {url}") if __name__ == '__main__': - # 测试视频URL解析 - print("=== 视频URL解析测试 ===") + # Test video URL parsing + print("=== Video URL Parsing Test ===") test_video_urls = [ "https://www.kuaishou.com/short-video/3x3zxz4mjrsc8ke?authorId=3x84qugg4ch9zhs&streamSource=search&area=searchxxnull&searchKey=python", "3xf8enb8dbj6uig", @@ -87,13 +87,13 @@ if __name__ == '__main__': try: result = parse_video_info_from_url(url) print(f"✓ URL: {url[:80]}...") - print(f" 结果: {result}\n") + print(f" Result: {result}\n") except Exception as e: print(f"✗ URL: {url}") - print(f" 错误: {e}\n") + print(f" Error: {e}\n") - # 测试创作者URL解析 - print("=== 创作者URL解析测试 ===") + # Test creator URL parsing + print("=== Creator URL Parsing Test ===") test_creator_urls = [ "https://www.kuaishou.com/profile/3x84qugg4ch9zhs", "3x4sm73aye7jq7i", @@ -102,7 +102,7 @@ if __name__ == '__main__': try: result = parse_creator_info_from_url(url) print(f"✓ URL: {url[:80]}...") - print(f" 结果: {result}\n") + print(f" Result: {result}\n") except Exception as e: print(f"✗ URL: {url}") - print(f" 错误: {e}\n") + print(f" Error: {e}\n") diff --git a/media_platform/tieba/client.py b/media_platform/tieba/client.py index 11206c4..79cffb5 100644 --- a/media_platform/tieba/client.py +++ b/media_platform/tieba/client.py @@ -48,7 +48,7 @@ class BaiduTieBaClient(AbstractApiClient): ): self.ip_pool: Optional[ProxyIpPool] = ip_pool self.timeout = timeout - # 使用传入的headers(包含真实浏览器UA)或默认headers + # Use provided headers (including real browser UA) or default headers self.headers = headers or { "User-Agent": utils.get_user_agent(), "Cookie": "", @@ -56,21 +56,21 @@ 
class BaiduTieBaClient(AbstractApiClient): self._host = "https://tieba.baidu.com" self._page_extractor = TieBaExtractor() self.default_ip_proxy = default_ip_proxy - self.playwright_page = playwright_page # Playwright页面对象 + self.playwright_page = playwright_page # Playwright page object def _sync_request(self, method, url, proxy=None, **kwargs): """ - 同步的requests请求方法 + Synchronous requests method Args: - method: 请求方法 - url: 请求的URL - proxy: 代理IP - **kwargs: 其他请求参数 + method: Request method + url: Request URL + proxy: Proxy IP + **kwargs: Other request parameters Returns: - response对象 + Response object """ - # 构造代理字典 + # Construct proxy dictionary proxies = None if proxy: proxies = { @@ -78,7 +78,7 @@ class BaiduTieBaClient(AbstractApiClient): "https": proxy, } - # 发送请求 + # Send request response = requests.request( method=method, url=url, @@ -91,7 +91,7 @@ class BaiduTieBaClient(AbstractApiClient): async def _refresh_proxy_if_expired(self) -> None: """ - 检测代理是否过期,如果过期则自动刷新 + Check if proxy is expired and automatically refresh if necessary """ if self.ip_pool is None: return @@ -101,7 +101,7 @@ class BaiduTieBaClient(AbstractApiClient): "[BaiduTieBaClient._refresh_proxy_if_expired] Proxy expired, refreshing..." ) new_proxy = await self.ip_pool.get_or_refresh_proxy() - # 更新代理URL + # Update proxy URL _, self.default_ip_proxy = utils.format_proxy_info(new_proxy) utils.logger.info( f"[BaiduTieBaClient._refresh_proxy_if_expired] New proxy: {new_proxy.ip}:{new_proxy.port}" @@ -110,23 +110,23 @@ class BaiduTieBaClient(AbstractApiClient): @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]: """ - 封装requests的公共请求方法,对请求响应做一些处理 + Common request method wrapper for requests, handles request responses Args: - method: 请求方法 - url: 请求的URL - return_ori_content: 是否返回原始内容 - proxy: 代理IP - **kwargs: 其他请求参数,例如请求头、请求体等 + method: Request method + url: Request URL + return_ori_content: Whether to return original content + proxy: Proxy IP + **kwargs: Other request parameters, such as headers, request body, etc. 
Returns: """ - # 每次请求前检测代理是否过期 + # Check if proxy is expired before each request await self._refresh_proxy_if_expired() actual_proxy = proxy if proxy else self.default_ip_proxy - # 在线程池中执行同步的requests请求 + # Execute synchronous requests in thread pool response = await asyncio.to_thread( self._sync_request, method, @@ -151,11 +151,11 @@ class BaiduTieBaClient(AbstractApiClient): async def get(self, uri: str, params=None, return_ori_content=False, **kwargs) -> Any: """ - GET请求,对请求头签名 + GET request with header signing Args: - uri: 请求路由 - params: 请求参数 - return_ori_content: 是否返回原始内容 + uri: Request route + params: Request parameters + return_ori_content: Whether to return original content Returns: @@ -175,15 +175,15 @@ class BaiduTieBaClient(AbstractApiClient): self.default_ip_proxy = proxy return res - utils.logger.error(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") - raise Exception(f"[BaiduTieBaClient.get] 达到了最大重试次数,IP已经被Block,请尝试更换新的IP代理: {e}") + utils.logger.error(f"[BaiduTieBaClient.get] Reached maximum retry attempts, IP is blocked, please try a new IP proxy: {e}") + raise Exception(f"[BaiduTieBaClient.get] Reached maximum retry attempts, IP is blocked, please try a new IP proxy: {e}") async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ - POST请求,对请求头签名 + POST request with header signing Args: - uri: 请求路由 - data: 请求体参数 + uri: Request route + data: Request body parameters Returns: @@ -193,13 +193,13 @@ class BaiduTieBaClient(AbstractApiClient): async def pong(self, browser_context: BrowserContext = None) -> bool: """ - 用于检查登录态是否失效了 - 使用Cookie检测而非API调用,避免被检测 + Check if login state is still valid + Uses Cookie detection instead of API calls to avoid detection Args: - browser_context: 浏览器上下文对象 + browser_context: Browser context object Returns: - bool: True表示已登录,False表示未登录 + bool: True if logged in, False if not logged in """ utils.logger.info("[BaiduTieBaClient.pong] Begin to check tieba login state by cookies...") @@ -208,13 +208,13 @@ class BaiduTieBaClient(AbstractApiClient): return False try: - # 从浏览器获取cookies并检查关键登录cookie + # Get cookies from browser and check key login cookies _, cookie_dict = utils.convert_cookies(await browser_context.cookies()) - # 百度贴吧的登录标识: STOKEN 或 PTOKEN + # Baidu Tieba login identifiers: STOKEN or PTOKEN stoken = cookie_dict.get("STOKEN") ptoken = cookie_dict.get("PTOKEN") - bduss = cookie_dict.get("BDUSS") # 百度通用登录cookie + bduss = cookie_dict.get("BDUSS") # Baidu universal login cookie if stoken or ptoken or bduss: utils.logger.info(f"[BaiduTieBaClient.pong] Login state verified by cookies (STOKEN: {bool(stoken)}, PTOKEN: {bool(ptoken)}, BDUSS: {bool(bduss)})") @@ -229,9 +229,9 @@ class BaiduTieBaClient(AbstractApiClient): async def update_cookies(self, browser_context: BrowserContext): """ - API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法 + Update cookies method provided by API client, usually called after successful login Args: - browser_context: 浏览器上下文对象 + browser_context: Browser context object Returns: @@ -249,13 +249,13 @@ class BaiduTieBaClient(AbstractApiClient): note_type: SearchNoteType = SearchNoteType.FIXED_THREAD, ) -> List[TiebaNote]: """ - 根据关键词搜索贴吧帖子 (使用Playwright访问页面,避免API检测) + Search Tieba posts by keyword (uses Playwright to access page, avoiding API detection) Args: - keyword: 关键词 - page: 分页第几页 - page_size: 每页大小 - sort: 结果排序方式 - note_type: 帖子类型(主题贴|主题+回复混合模式) + keyword: Keyword + page: Page number + page_size: Page size + sort: Result sort method + note_type: Post type (main thread | main thread + reply mixed mode) 
Returns: """ @@ -263,8 +263,8 @@ class BaiduTieBaClient(AbstractApiClient): utils.logger.error("[BaiduTieBaClient.get_notes_by_keyword] playwright_page is None, cannot use browser mode") raise Exception("playwright_page is required for browser-based search") - # 构造搜索URL - # 示例: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=编程 + # Construct search URL + # Example: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=keyword search_url = f"{self._host}/f/search/res" params = { "ie": "utf-8", @@ -275,64 +275,64 @@ class BaiduTieBaClient(AbstractApiClient): "only_thread": note_type.value, } - # 拼接完整URL + # Concatenate full URL full_url = f"{search_url}?{urlencode(params)}" - utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 访问搜索页面: {full_url}") + utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Accessing search page: {full_url}") try: - # 使用Playwright访问搜索页面 + # Use Playwright to access search page await self.playwright_page.goto(full_url, wait_until="domcontentloaded") - # 等待页面加载,使用配置文件中的延时设置 + # Wait for page loading, using delay setting from config file await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) - # 获取页面HTML内容 + # Get page HTML content page_content = await self.playwright_page.content() - utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 成功获取搜索页面HTML,长度: {len(page_content)}") + utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Successfully retrieved search page HTML, length: {len(page_content)}") - # 提取搜索结果 + # Extract search results notes = self._page_extractor.extract_search_note_list(page_content) - utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 提取到 {len(notes)} 条帖子") + utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Extracted {len(notes)} posts") return notes except Exception as e: - utils.logger.error(f"[BaiduTieBaClient.get_notes_by_keyword] 搜索失败: {e}") + utils.logger.error(f"[BaiduTieBaClient.get_notes_by_keyword] Search failed: {e}") raise async def get_note_by_id(self, note_id: str) -> TiebaNote: """ - 根据帖子ID获取帖子详情 (使用Playwright访问页面,避免API检测) + Get post details by post ID (uses Playwright to access page, avoiding API detection) Args: - note_id: 帖子ID + note_id: Post ID Returns: - TiebaNote: 帖子详情对象 + TiebaNote: Post detail object """ if not self.playwright_page: utils.logger.error("[BaiduTieBaClient.get_note_by_id] playwright_page is None, cannot use browser mode") raise Exception("playwright_page is required for browser-based note detail fetching") - # 构造帖子详情URL + # Construct post detail URL note_url = f"{self._host}/p/{note_id}" - utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] 访问帖子详情页面: {note_url}") + utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Accessing post detail page: {note_url}") try: - # 使用Playwright访问帖子详情页面 + # Use Playwright to access post detail page await self.playwright_page.goto(note_url, wait_until="domcontentloaded") - # 等待页面加载,使用配置文件中的延时设置 + # Wait for page loading, using delay setting from config file await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) - # 获取页面HTML内容 + # Get page HTML content page_content = await self.playwright_page.content() - utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] 成功获取帖子详情HTML,长度: {len(page_content)}") + utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Successfully retrieved post detail HTML, length: {len(page_content)}") - # 提取帖子详情 + # Extract post details note_detail = self._page_extractor.extract_note_detail(page_content) return note_detail except Exception as e: - utils.logger.error(f"[BaiduTieBaClient.get_note_by_id] 获取帖子详情失败: {e}") + 
utils.logger.error(f"[BaiduTieBaClient.get_note_by_id] Failed to get post details: {e}") raise async def get_note_all_comments( @@ -343,14 +343,14 @@ class BaiduTieBaClient(AbstractApiClient): max_count: int = 10, ) -> List[TiebaComment]: """ - 获取指定帖子下的所有一级评论 (使用Playwright访问页面,避免API检测) + Get all first-level comments for specified post (uses Playwright to access page, avoiding API detection) Args: - note_detail: 帖子详情对象 - crawl_interval: 爬取一次笔记的延迟单位(秒) - callback: 一次笔记爬取结束后的回调函数 - max_count: 一次帖子爬取的最大评论数量 + note_detail: Post detail object + crawl_interval: Crawl delay interval in seconds + callback: Callback function after one post crawl completes + max_count: Maximum number of comments to crawl per post Returns: - List[TiebaComment]: 评论列表 + List[TiebaComment]: Comment list """ if not self.playwright_page: utils.logger.error("[BaiduTieBaClient.get_note_all_comments] playwright_page is None, cannot use browser mode") @@ -360,30 +360,30 @@ class BaiduTieBaClient(AbstractApiClient): current_page = 1 while note_detail.total_replay_page >= current_page and len(result) < max_count: - # 构造评论页URL + # Construct comment page URL comment_url = f"{self._host}/p/{note_detail.note_id}?pn={current_page}" - utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 访问评论页面: {comment_url}") + utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Accessing comment page: {comment_url}") try: - # 使用Playwright访问评论页面 + # Use Playwright to access comment page await self.playwright_page.goto(comment_url, wait_until="domcontentloaded") - # 等待页面加载,使用配置文件中的延时设置 + # Wait for page loading, using delay setting from config file await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) - # 获取页面HTML内容 + # Get page HTML content page_content = await self.playwright_page.content() - # 提取评论 + # Extract comments comments = self._page_extractor.extract_tieba_note_parment_comments( page_content, note_id=note_detail.note_id ) if not comments: - utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有评论,停止爬取") + utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Page {current_page} has no comments, stopping crawl") break - # 限制评论数量 + # Limit comment count if len(result) + len(comments) > max_count: comments = comments[:max_count - len(result)] @@ -392,7 +392,7 @@ class BaiduTieBaClient(AbstractApiClient): result.extend(comments) - # 获取所有子评论 + # Get all sub-comments await self.get_comments_all_sub_comments( comments, crawl_interval=crawl_interval, callback=callback ) @@ -401,10 +401,10 @@ class BaiduTieBaClient(AbstractApiClient): current_page += 1 except Exception as e: - utils.logger.error(f"[BaiduTieBaClient.get_note_all_comments] 获取第{current_page}页评论失败: {e}") + utils.logger.error(f"[BaiduTieBaClient.get_note_all_comments] Failed to get page {current_page} comments: {e}") break - utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 共获取 {len(result)} 条一级评论") + utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Total retrieved {len(result)} first-level comments") return result async def get_comments_all_sub_comments( @@ -414,14 +414,14 @@ class BaiduTieBaClient(AbstractApiClient): callback: Optional[Callable] = None, ) -> List[TiebaComment]: """ - 获取指定评论下的所有子评论 (使用Playwright访问页面,避免API检测) + Get all sub-comments for specified comments (uses Playwright to access page, avoiding API detection) Args: - comments: 评论列表 - crawl_interval: 爬取一次笔记的延迟单位(秒) - callback: 一次笔记爬取结束后的回调函数 + comments: Comment list + crawl_interval: Crawl delay interval in seconds + callback: Callback function after one 
post crawl completes Returns: - List[TiebaComment]: 子评论列表 + List[TiebaComment]: Sub-comment list """ if not config.ENABLE_GET_SUB_COMMENTS: return [] @@ -440,7 +440,7 @@ class BaiduTieBaClient(AbstractApiClient): max_sub_page_num = parment_comment.sub_comment_count // 10 + 1 while max_sub_page_num >= current_page: - # 构造子评论URL + # Construct sub-comment URL sub_comment_url = ( f"{self._host}/p/comment?" f"tid={parment_comment.note_id}&" @@ -448,19 +448,19 @@ class BaiduTieBaClient(AbstractApiClient): f"fid={parment_comment.tieba_id}&" f"pn={current_page}" ) - utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] 访问子评论页面: {sub_comment_url}") + utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] Accessing sub-comment page: {sub_comment_url}") try: - # 使用Playwright访问子评论页面 + # Use Playwright to access sub-comment page await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded") - # 等待页面加载,使用配置文件中的延时设置 + # Wait for page loading, using delay setting from config file await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) - # 获取页面HTML内容 + # Get page HTML content page_content = await self.playwright_page.content() - # 提取子评论 + # Extract sub-comments sub_comments = self._page_extractor.extract_tieba_note_sub_comments( page_content, parent_comment=parment_comment ) @@ -468,7 +468,7 @@ class BaiduTieBaClient(AbstractApiClient): if not sub_comments: utils.logger.info( f"[BaiduTieBaClient.get_comments_all_sub_comments] " - f"评论{parment_comment.comment_id}第{current_page}页没有子评论,停止爬取" + f"Comment {parment_comment.comment_id} page {current_page} has no sub-comments, stopping crawl" ) break @@ -482,125 +482,125 @@ class BaiduTieBaClient(AbstractApiClient): except Exception as e: utils.logger.error( f"[BaiduTieBaClient.get_comments_all_sub_comments] " - f"获取评论{parment_comment.comment_id}第{current_page}页子评论失败: {e}" + f"Failed to get comment {parment_comment.comment_id} page {current_page} sub-comments: {e}" ) break - utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] 共获取 {len(all_sub_comments)} 条子评论") + utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] Total retrieved {len(all_sub_comments)} sub-comments") return all_sub_comments async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]: """ - 根据贴吧名称获取帖子列表 (使用Playwright访问页面,避免API检测) + Get post list by Tieba name (uses Playwright to access page, avoiding API detection) Args: - tieba_name: 贴吧名称 - page_num: 分页页码 + tieba_name: Tieba name + page_num: Page number Returns: - List[TiebaNote]: 帖子列表 + List[TiebaNote]: Post list """ if not self.playwright_page: utils.logger.error("[BaiduTieBaClient.get_notes_by_tieba_name] playwright_page is None, cannot use browser mode") raise Exception("playwright_page is required for browser-based tieba note fetching") - # 构造贴吧帖子列表URL + # Construct Tieba post list URL tieba_url = f"{self._host}/f?kw={quote(tieba_name)}&pn={page_num}" - utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 访问贴吧页面: {tieba_url}") + utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Accessing Tieba page: {tieba_url}") try: - # 使用Playwright访问贴吧页面 + # Use Playwright to access Tieba page await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded") - # 等待页面加载,使用配置文件中的延时设置 + # Wait for page loading, using delay setting from config file await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) - # 获取页面HTML内容 + # Get page HTML content page_content = await self.playwright_page.content() - 
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 成功获取贴吧页面HTML,长度: {len(page_content)}") + utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Successfully retrieved Tieba page HTML, length: {len(page_content)}") - # 提取帖子列表 + # Extract post list notes = self._page_extractor.extract_tieba_note_list(page_content) - utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 提取到 {len(notes)} 条帖子") + utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Extracted {len(notes)} posts") return notes except Exception as e: - utils.logger.error(f"[BaiduTieBaClient.get_notes_by_tieba_name] 获取贴吧帖子列表失败: {e}") + utils.logger.error(f"[BaiduTieBaClient.get_notes_by_tieba_name] Failed to get Tieba post list: {e}") raise async def get_creator_info_by_url(self, creator_url: str) -> str: """ - 根据创作者URL获取创作者信息 (使用Playwright访问页面,避免API检测) + Get creator information by creator URL (uses Playwright to access page, avoiding API detection) Args: - creator_url: 创作者主页URL + creator_url: Creator homepage URL Returns: - str: 页面HTML内容 + str: Page HTML content """ if not self.playwright_page: utils.logger.error("[BaiduTieBaClient.get_creator_info_by_url] playwright_page is None, cannot use browser mode") raise Exception("playwright_page is required for browser-based creator info fetching") - utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] 访问创作者主页: {creator_url}") + utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Accessing creator homepage: {creator_url}") try: - # 使用Playwright访问创作者主页 + # Use Playwright to access creator homepage await self.playwright_page.goto(creator_url, wait_until="domcontentloaded") - # 等待页面加载,使用配置文件中的延时设置 + # Wait for page loading, using delay setting from config file await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) - # 获取页面HTML内容 + # Get page HTML content page_content = await self.playwright_page.content() - utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] 成功获取创作者主页HTML,长度: {len(page_content)}") + utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Successfully retrieved creator homepage HTML, length: {len(page_content)}") return page_content except Exception as e: - utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] 获取创作者主页失败: {e}") + utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] Failed to get creator homepage: {e}") raise async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict: """ - 根据创作者获取创作者的帖子 (使用Playwright访问页面,避免API检测) + Get creator's posts by creator (uses Playwright to access page, avoiding API detection) Args: - user_name: 创作者用户名 - page_number: 页码 + user_name: Creator username + page_number: Page number Returns: - Dict: 包含帖子数据的字典 + Dict: Dictionary containing post data """ if not self.playwright_page: utils.logger.error("[BaiduTieBaClient.get_notes_by_creator] playwright_page is None, cannot use browser mode") raise Exception("playwright_page is required for browser-based creator notes fetching") - # 构造创作者帖子列表URL + # Construct creator post list URL creator_url = f"{self._host}/home/get/getthread?un={quote(user_name)}&pn={page_number}&id=utf-8&_={utils.get_current_timestamp()}" - utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] 访问创作者帖子列表: {creator_url}") + utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] Accessing creator post list: {creator_url}") try: - # 使用Playwright访问创作者帖子列表页面 + # Use Playwright to access creator post list page await self.playwright_page.goto(creator_url, wait_until="domcontentloaded") - # 
等待页面加载,使用配置文件中的延时设置 + # Wait for page loading, using delay setting from config file await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC) - # 获取页面内容(这个接口返回JSON) + # Get page content (this API returns JSON) page_content = await self.playwright_page.content() - # 提取JSON数据(页面会包含
<pre>标签或直接是JSON)
+            # Extract JSON data (the page either wraps it in a <pre> tag or is raw JSON)
             try:
-                # 尝试从页面中提取JSON
+                # Try to extract JSON from page
                 json_text = await self.playwright_page.evaluate("() => document.body.innerText")
                 result = json.loads(json_text)
-                utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] 成功获取创作者帖子数据")
+                utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] Successfully retrieved creator post data")
                 return result
             except json.JSONDecodeError as e:
-                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] JSON解析失败: {e}")
-                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] 页面内容: {page_content[:500]}")
+                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] JSON parsing failed: {e}")
+                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] Page content: {page_content[:500]}")
                 raise Exception(f"Failed to parse JSON from creator notes page: {e}")
 
         except Exception as e:
-            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] 获取创作者帖子列表失败: {e}")
+            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] Failed to get creator post list: {e}")
             raise
 
     async def get_all_notes_by_creator_user_name(
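For reference, a minimal sketch of the Playwright-based JSON fetch pattern that the hunk above documents (navigate, wait, read document.body.innerText, parse). The helper name and the fixed delay are illustrative only and are not part of this patch:

import asyncio
import json

from playwright.async_api import Page


async def fetch_json_via_page(page: Page, url: str, wait_sec: float = 2.0) -> dict:
    # Load the endpoint like a normal page visit instead of calling the API directly.
    await page.goto(url, wait_until="domcontentloaded")
    await asyncio.sleep(wait_sec)  # stand-in for config.CRAWLER_MAX_SLEEP_SEC
    # Browsers render a bare JSON response as visible text, so read and parse it.
    text = await page.evaluate("() => document.body.innerText")
    return json.loads(text)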
@@ -612,18 +612,18 @@ class BaiduTieBaClient(AbstractApiClient):
         creator_page_html_content: str = None,
     ) -> List[TiebaNote]:
         """
-        根据创作者用户名获取创作者所有帖子
+        Get all of a creator's posts by username
         Args:
-            user_name: 创作者用户名
-            crawl_interval: 爬取一次笔记的延迟单位(秒)
-            callback: 一次笔记爬取结束后的回调函数,是一个awaitable类型的函数
-            max_note_count: 帖子最大获取数量,如果为0则获取所有
-            creator_page_html_content: 创作者主页HTML内容
+            user_name: Creator username
+            crawl_interval: Crawl delay interval in seconds
+            callback: Callback invoked after each crawl round; must be an awaitable function
+            max_note_count: Maximum number of posts to retrieve; 0 means fetch all
+            creator_page_html_content: Creator homepage HTML content
 
         Returns:
 
         """
-        # 百度贴吧比较特殊一些,前10个帖子是直接展示在主页上的,要单独处理,通过API获取不到
+        # Baidu Tieba is a special case: the first 10 posts are displayed directly on the homepage and cannot be obtained through the API, so they need separate handling
         result: List[TiebaNote] = []
         if creator_page_html_content:
             thread_id_list = (self._page_extractor.extract_tieba_thread_id_list_from_creator_page(creator_page_html_content))
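Reviewer note: the get_notes_by_creator hunks above read a JSON endpoint through the Playwright page instead of an HTTP client. A minimal sketch of that pattern, assuming an already-initialized async Playwright Page; the helper name fetch_json_via_page is illustrative and not part of the client:

    import json

    from playwright.async_api import Page


    async def fetch_json_via_page(page: Page, url: str) -> dict:
        # Navigate with the real browser session so cookies and User-Agent match normal browsing.
        await page.goto(url, wait_until="domcontentloaded")
        # Chromium renders a bare JSON response inside a <pre> element,
        # so body.innerText recovers the raw payload for json.loads().
        raw = await page.evaluate("() => document.body.innerText")
        return json.loads(raw)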
diff --git a/media_platform/tieba/core.py b/media_platform/tieba/core.py
index 8a96120..74d4816 100644
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -79,9 +79,9 @@ class TieBaCrawler(AbstractCrawler):
             )
 
         async with async_playwright() as playwright:
-            # 根据配置选择启动模式
+            # Choose startup mode based on configuration
             if config.ENABLE_CDP_MODE:
-                utils.logger.info("[BaiduTieBaCrawler] 使用CDP模式启动浏览器")
+                utils.logger.info("[BaiduTieBaCrawler] Launching browser in CDP mode")
                 self.browser_context = await self.launch_browser_with_cdp(
                     playwright,
                     playwright_proxy_format,
@@ -89,7 +89,7 @@ class TieBaCrawler(AbstractCrawler):
                     headless=config.CDP_HEADLESS,
                 )
             else:
-                utils.logger.info("[BaiduTieBaCrawler] 使用标准模式启动浏览器")
+                utils.logger.info("[BaiduTieBaCrawler] Launching browser in standard mode")
                 # Launch a browser context.
                 chromium = playwright.chromium
                 self.browser_context = await self.launch_browser(
@@ -99,12 +99,12 @@ class TieBaCrawler(AbstractCrawler):
                     headless=config.HEADLESS,
                 )
 
-            # 注入反检测脚本 - 针对百度的特殊检测
+            # Inject anti-detection scripts - for Baidu's special detection
             await self._inject_anti_detection_scripts()
 
             self.context_page = await self.browser_context.new_page()
 
-            # 先访问百度首页,再点击贴吧链接,避免触发安全验证
+            # First visit the Baidu homepage, then click the Tieba link to avoid triggering security verification
             await self._navigate_to_tieba_via_baidu()
 
             # Create a client to interact with the baidutieba website.
@@ -399,29 +399,29 @@ class TieBaCrawler(AbstractCrawler):
 
     async def _navigate_to_tieba_via_baidu(self):
         """
-        模拟真实用户访问路径:
-        1. 先访问百度首页 (https://www.baidu.com/)
-        2. 等待页面加载
-        3. 点击顶部导航栏的"贴吧"链接
-        4. 跳转到贴吧首页
+        Simulate real user access path:
+        1. First visit Baidu homepage (https://www.baidu.com/)
+        2. Wait for page to load
+        3. Click "Tieba" link in top navigation bar
+        4. Jump to Tieba homepage
 
-        这样做可以避免触发百度的安全验证
+        This avoids triggering Baidu's security verification
         """
-        utils.logger.info("[TieBaCrawler] 模拟真实用户访问路径...")
+        utils.logger.info("[TieBaCrawler] Simulating real user access path...")
 
         try:
-            # Step 1: 访问百度首页
-            utils.logger.info("[TieBaCrawler] Step 1: 访问百度首页 https://www.baidu.com/")
+            # Step 1: Visit Baidu homepage
+            utils.logger.info("[TieBaCrawler] Step 1: Visiting Baidu homepage https://www.baidu.com/")
             await self.context_page.goto("https://www.baidu.com/", wait_until="domcontentloaded")
 
-            # Step 2: 等待页面加载,使用配置文件中的延时设置
-            utils.logger.info(f"[TieBaCrawler] Step 2: 等待 {config.CRAWLER_MAX_SLEEP_SEC}秒 模拟用户浏览...")
+            # Step 2: Wait for page loading, using delay setting from config file
+            utils.logger.info(f"[TieBaCrawler] Step 2: Waiting {config.CRAWLER_MAX_SLEEP_SEC} seconds to simulate user browsing...")
             await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
 
-            # Step 3: 查找并点击"贴吧"链接
-            utils.logger.info("[TieBaCrawler] Step 3: 查找并点击'贴吧'链接...")
+            # Step 3: Find and click "Tieba" link
+            utils.logger.info("[TieBaCrawler] Step 3: Finding and clicking 'Tieba' link...")
 
-            # 尝试多种选择器,确保能找到贴吧链接
+            # Try multiple selectors to ensure finding the Tieba link
             tieba_selectors = [
                 'a[href="http://tieba.baidu.com/"]',
                 'a[href="https://tieba.baidu.com/"]',
@@ -434,74 +434,74 @@ class TieBaCrawler(AbstractCrawler):
                 try:
                     tieba_link = await self.context_page.wait_for_selector(selector, timeout=5000)
                     if tieba_link:
-                        utils.logger.info(f"[TieBaCrawler] 找到贴吧链接 (selector: {selector})")
+                        utils.logger.info(f"[TieBaCrawler] Found Tieba link (selector: {selector})")
                         break
                 except Exception:
                     continue
 
             if not tieba_link:
-                utils.logger.warning("[TieBaCrawler] 未找到贴吧链接,直接访问贴吧首页")
+                utils.logger.warning("[TieBaCrawler] Tieba link not found, directly accessing Tieba homepage")
                 await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
                 return
 
-            # Step 4: 点击贴吧链接 (检查是否会打开新标签页)
-            utils.logger.info("[TieBaCrawler] Step 4: 点击贴吧链接...")
+            # Step 4: Click Tieba link (check if it will open in a new tab)
+            utils.logger.info("[TieBaCrawler] Step 4: Clicking Tieba link...")
 
-            # 检查链接的target属性
+            # Check link's target attribute
             target_attr = await tieba_link.get_attribute("target")
-            utils.logger.info(f"[TieBaCrawler] 链接target属性: {target_attr}")
+            utils.logger.info(f"[TieBaCrawler] Link target attribute: {target_attr}")
 
             if target_attr == "_blank":
-                # 如果是新标签页,需要等待新页面并切换
-                utils.logger.info("[TieBaCrawler] 链接会在新标签页打开,等待新页面...")
+                # If the link opens a new tab, wait for the new page and switch to it
+                utils.logger.info("[TieBaCrawler] Link will open in a new tab, waiting for the new page...")
 
                 async with self.browser_context.expect_page() as new_page_info:
                     await tieba_link.click()
 
-                # 获取新打开的页面
+                # Get newly opened page
                 new_page = await new_page_info.value
                 await new_page.wait_for_load_state("domcontentloaded")
 
-                # 关闭旧的百度首页
+                # Close old Baidu homepage
                 await self.context_page.close()
 
-                # 切换到新的贴吧页面
+                # Switch to new Tieba page
                 self.context_page = new_page
-                utils.logger.info("[TieBaCrawler] ✅ 已切换到新标签页 (贴吧页面)")
+                utils.logger.info("[TieBaCrawler] Successfully switched to new tab (Tieba page)")
             else:
-                # 如果是同一标签页跳转,正常等待导航
-                utils.logger.info("[TieBaCrawler] 链接在当前标签页跳转...")
+                # If navigation happens in the same tab, just wait for it
+                utils.logger.info("[TieBaCrawler] Link navigates in the current tab...")
                 async with self.context_page.expect_navigation(wait_until="domcontentloaded"):
                     await tieba_link.click()
 
-            # Step 5: 等待页面稳定,使用配置文件中的延时设置
-            utils.logger.info(f"[TieBaCrawler] Step 5: 页面加载完成,等待 {config.CRAWLER_MAX_SLEEP_SEC}秒...")
+            # Step 5: Wait for page to stabilize, using delay setting from config file
+            utils.logger.info(f"[TieBaCrawler] Step 5: Page loaded, waiting {config.CRAWLER_MAX_SLEEP_SEC} seconds...")
             await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
 
             current_url = self.context_page.url
-            utils.logger.info(f"[TieBaCrawler] ✅ 成功通过百度首页进入贴吧! 当前URL: {current_url}")
+            utils.logger.info(f"[TieBaCrawler] Successfully entered Tieba via Baidu homepage! Current URL: {current_url}")
 
         except Exception as e:
-            utils.logger.error(f"[TieBaCrawler] 通过百度首页访问贴吧失败: {e}")
-            utils.logger.info("[TieBaCrawler] 回退:直接访问贴吧首页")
+            utils.logger.error(f"[TieBaCrawler] Failed to access Tieba via Baidu homepage: {e}")
+            utils.logger.info("[TieBaCrawler] Fallback: directly accessing Tieba homepage")
             await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
 
     async def _inject_anti_detection_scripts(self):
         """
-        注入反检测JavaScript脚本
-        针对百度贴吧的特殊检测机制
+        Inject anti-detection JavaScript scripts
+        For Baidu Tieba's special detection mechanism
         """
         utils.logger.info("[TieBaCrawler] Injecting anti-detection scripts...")
 
-        # 轻量级反检测脚本,只覆盖关键检测点
+        # Lightweight anti-detection script, only covering key detection points
         anti_detection_js = """
-        // 覆盖 navigator.webdriver
+        // Override navigator.webdriver
         Object.defineProperty(navigator, 'webdriver', {
             get: () => undefined,
             configurable: true
         });
 
-        // 覆盖 window.navigator.chrome
+        // Override window.navigator.chrome
         if (!window.navigator.chrome) {
             window.navigator.chrome = {
                 runtime: {},
@@ -511,7 +511,7 @@ class TieBaCrawler(AbstractCrawler):
             };
         }
 
-        // 覆盖 Permissions API
+        // Override Permissions API
         const originalQuery = window.navigator.permissions.query;
         window.navigator.permissions.query = (parameters) => (
             parameters.name === 'notifications' ?
@@ -519,19 +519,19 @@ class TieBaCrawler(AbstractCrawler):
                 originalQuery(parameters)
         );
 
-        // 覆盖 plugins 长度(让它看起来有插件)
+        // Override plugins length (make it look like there are plugins)
         Object.defineProperty(navigator, 'plugins', {
             get: () => [1, 2, 3, 4, 5],
             configurable: true
         });
 
-        // 覆盖 languages
+        // Override languages
         Object.defineProperty(navigator, 'languages', {
             get: () => ['zh-CN', 'zh', 'en'],
             configurable: true
         });
 
-        // 移除 window.cdc_ 等 ChromeDriver 残留
+        // Remove window.cdc_ and other ChromeDriver remnants
         delete window.cdc_adoQpoasnfa76pfcZLmcfl_Array;
         delete window.cdc_adoQpoasnfa76pfcZLmcfl_Promise;
         delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
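Reviewer note: the hunks above only show the override script itself; how it is registered is outside this diff. For context, the usual Playwright mechanism for running such a script before any page script executes is add_init_script — a sketch under that assumption, not a claim about this repo's exact wiring:

    from playwright.async_api import BrowserContext

    # A single representative override; the full script is shown in the hunks above.
    ANTI_DETECTION_JS = """
    Object.defineProperty(navigator, 'webdriver', { get: () => undefined, configurable: true });
    """


    async def install_stealth_script(context: BrowserContext) -> None:
        # Registered scripts run in every new page and frame before site scripts execute.
        await context.add_init_script(script=ANTI_DETECTION_JS)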
@@ -548,21 +548,21 @@ class TieBaCrawler(AbstractCrawler):
         """
         Create tieba client with real browser User-Agent and complete headers
         Args:
-            httpx_proxy: HTTP代理
-            ip_pool: IP代理池
+            httpx_proxy: HTTP proxy
+            ip_pool: IP proxy pool
 
         Returns:
-            BaiduTieBaClient实例
+            BaiduTieBaClient instance
         """
         utils.logger.info("[TieBaCrawler.create_tieba_client] Begin create tieba API client...")
 
-        # 从真实浏览器提取User-Agent,避免被检测
+        # Extract User-Agent from real browser to avoid detection
         user_agent = await self.context_page.evaluate("() => navigator.userAgent")
         utils.logger.info(f"[TieBaCrawler.create_tieba_client] Extracted User-Agent from browser: {user_agent}")
 
         cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
 
-        # 构建完整的浏览器请求头,模拟真实浏览器行为
+        # Build complete browser request headers, simulating real browser behavior
         tieba_client = BaiduTieBaClient(
             timeout=10,
             ip_pool=ip_pool,
@@ -572,7 +572,7 @@ class TieBaCrawler(AbstractCrawler):
                 "Accept-Language": "zh-CN,zh;q=0.9",
                 "Accept-Encoding": "gzip, deflate, br",
                 "Connection": "keep-alive",
-                "User-Agent": user_agent,  # 使用真实浏览器的UA
+                "User-Agent": user_agent,  # Use real browser UA
                 "Cookie": cookie_str,
                 "Host": "tieba.baidu.com",
                 "Referer": "https://tieba.baidu.com/",
@@ -585,7 +585,7 @@ class TieBaCrawler(AbstractCrawler):
                 "sec-ch-ua-mobile": "?0",
                 "sec-ch-ua-platform": '"macOS"',
             },
-            playwright_page=self.context_page,  # 传入playwright页面对象
+            playwright_page=self.context_page,  # Pass in playwright page object
         )
         return tieba_client
 
@@ -623,7 +623,7 @@ class TieBaCrawler(AbstractCrawler):
                 proxy=playwright_proxy,  # type: ignore
                 viewport={"width": 1920, "height": 1080},
                 user_agent=user_agent,
-                channel="chrome",  # 使用系统的Chrome稳定版
+                channel="chrome",  # Use system's stable Chrome version
             )
             return browser_context
         else:
@@ -641,7 +641,7 @@ class TieBaCrawler(AbstractCrawler):
         headless: bool = True,
     ) -> BrowserContext:
         """
-        使用CDP模式启动浏览器
+        Launch browser using CDP mode
         """
         try:
             self.cdp_manager = CDPBrowserManager()
@@ -652,15 +652,15 @@ class TieBaCrawler(AbstractCrawler):
                 headless=headless,
             )
 
-            # 显示浏览器信息
+            # Display browser information
             browser_info = await self.cdp_manager.get_browser_info()
-            utils.logger.info(f"[TieBaCrawler] CDP浏览器信息: {browser_info}")
+            utils.logger.info(f"[TieBaCrawler] CDP browser info: {browser_info}")
 
             return browser_context
 
         except Exception as e:
-            utils.logger.error(f"[TieBaCrawler] CDP模式启动失败,回退到标准模式: {e}")
-            # 回退到标准模式
+            utils.logger.error(f"[TieBaCrawler] CDP mode launch failed, falling back to standard mode: {e}")
+            # Fall back to standard mode
             chromium = playwright.chromium
             return await self.launch_browser(
                 chromium, playwright_proxy, user_agent, headless
@@ -672,7 +672,7 @@ class TieBaCrawler(AbstractCrawler):
         Returns:
 
         """
-        # 如果使用CDP模式,需要特殊处理
+        # If using CDP mode, cleanup requires special handling
         if self.cdp_manager:
             await self.cdp_manager.cleanup()
             self.cdp_manager = None
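Reviewer note: launch_browser_with_cdp above degrades to the standard launch on any CDP failure. The general shape of that fallback, sketched with placeholder callables rather than the repo's actual CDP API:

    import logging

    logger = logging.getLogger(__name__)


    async def launch_with_fallback(primary_launch, fallback_launch):
        # Prefer the CDP-attached launch; any exception falls back to the plain Playwright launch.
        try:
            return await primary_launch()
        except Exception as exc:
            logger.error("Primary launch failed, falling back to standard mode: %s", exc)
            return await fallback_launch()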
diff --git a/media_platform/tieba/field.py b/media_platform/tieba/field.py
index 23ed40b..71da44e 100644
--- a/media_platform/tieba/field.py
+++ b/media_platform/tieba/field.py
@@ -23,16 +23,16 @@ from enum import Enum
 
 class SearchSortType(Enum):
     """search sort type"""
-    # 按时间倒序
+    # Sort by time in descending order
     TIME_DESC = "1"
-    # 按时间顺序
+    # Sort by time in ascending order
     TIME_ASC = "0"
-    # 按相关性顺序
+    # Sort by relevance
     RELEVANCE_ORDER = "2"
 
 
 class SearchNoteType(Enum):
-    # 只看主题贴
+    # Main threads (original posts) only
     MAIN_THREAD = "1"
-    # 混合模式(帖子+回复)
+    # Mixed mode (posts + replies)
     FIXED_THREAD = "0"
diff --git a/media_platform/tieba/help.py b/media_platform/tieba/help.py
index f276752..a91a67d 100644
--- a/media_platform/tieba/help.py
+++ b/media_platform/tieba/help.py
@@ -42,12 +42,12 @@ class TieBaExtractor:
     @staticmethod
     def extract_search_note_list(page_content: str) -> List[TiebaNote]:
         """
-        提取贴吧帖子列表,这里提取的关键词搜索结果页的数据,还缺少帖子的回复数和回复页等数据
+        Extract the Tieba post list from a keyword search result page; reply count and reply page data are not yet included here
         Args:
-            page_content: 页面内容的HTML字符串
+            page_content: HTML string of page content
 
         Returns:
-            包含帖子信息的字典列表
+            List of Tieba post objects
         """
         xpath_selector = "//div[@class='s_post']"
         post_list = Selector(text=page_content).xpath(xpath_selector)
@@ -71,12 +71,12 @@ class TieBaExtractor:
 
     def extract_tieba_note_list(self, page_content: str) -> List[TiebaNote]:
         """
-        提取贴吧帖子列表
+        Extract Tieba post list from Tieba page
         Args:
-            page_content:
+            page_content: HTML string of page content
 
         Returns:
-
+            List of Tieba post objects
         """
         page_content = page_content.replace('