i18n: translate all Chinese comments, docstrings, and logger messages to English

Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: the Chinese disclaimer header (lines 10-18) is left untranslated for legal compliance

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
程序员阿江(Relakkes)
2025-12-26 23:27:19 +08:00
parent 1544d13dd5
commit 157ddfb21b
93 changed files with 1971 additions and 1955 deletions

View File

@@ -34,7 +34,7 @@ from tools import utils
class CDPBrowserManager:
"""
CDP浏览器管理器负责启动和管理通过CDP连接的浏览器
CDP browser manager, responsible for launching and managing browsers connected via CDP
"""
def __init__(self):
@@ -46,27 +46,27 @@ class CDPBrowserManager:
def _register_cleanup_handlers(self):
"""
注册清理处理器,确保程序退出时清理浏览器进程
Register cleanup handlers to ensure browser process cleanup on program exit
"""
if self._cleanup_registered:
return
def sync_cleanup():
"""同步清理函数,用于atexit"""
"""Synchronous cleanup function for atexit"""
if self.launcher and self.launcher.browser_process:
utils.logger.info("[CDPBrowserManager] atexit: 清理浏览器进程")
utils.logger.info("[CDPBrowserManager] atexit: Cleaning up browser process")
self.launcher.cleanup()
# 注册atexit清理
# Register atexit cleanup
atexit.register(sync_cleanup)
# 注册信号处理器(仅在没有自定义处理器时注册,避免覆盖主入口的信号处理逻辑)
# Register signal handlers (only when no custom handlers exist, to avoid overriding main entry signal handling logic)
prev_sigint = signal.getsignal(signal.SIGINT)
prev_sigterm = signal.getsignal(signal.SIGTERM)
def signal_handler(signum, frame):
"""信号处理器"""
utils.logger.info(f"[CDPBrowserManager] 收到信号 {signum},清理浏览器进程")
"""Signal handler"""
utils.logger.info(f"[CDPBrowserManager] Received signal {signum}, cleaning up browser process")
if self.launcher and self.launcher.browser_process:
self.launcher.cleanup()
@@ -80,19 +80,19 @@ class CDPBrowserManager:
install_sigint = prev_sigint in (signal.default_int_handler, signal.SIG_DFL)
install_sigterm = prev_sigterm == signal.SIG_DFL
# 注册SIGINT (Ctrl+C) SIGTERM
# Register SIGINT (Ctrl+C) and SIGTERM
if install_sigint:
signal.signal(signal.SIGINT, signal_handler)
else:
utils.logger.info("[CDPBrowserManager] 已存在SIGINT处理器,跳过注册以避免覆盖")
utils.logger.info("[CDPBrowserManager] SIGINT handler already exists, skipping registration to avoid override")
if install_sigterm:
signal.signal(signal.SIGTERM, signal_handler)
else:
utils.logger.info("[CDPBrowserManager] 已存在SIGTERM处理器,跳过注册以避免覆盖")
utils.logger.info("[CDPBrowserManager] SIGTERM handler already exists, skipping registration to avoid override")
self._cleanup_registered = True
utils.logger.info("[CDPBrowserManager] 清理处理器已注册")
utils.logger.info("[CDPBrowserManager] Cleanup handlers registered")
async def launch_and_connect(
self,
@@ -102,25 +102,25 @@ class CDPBrowserManager:
headless: bool = False,
) -> BrowserContext:
"""
启动浏览器并通过CDP连接
Launch browser and connect via CDP
"""
try:
# 1. 检测浏览器路径
# 1. Detect browser path
browser_path = await self._get_browser_path()
# 2. 获取可用端口
# 2. Get available port
self.debug_port = self.launcher.find_available_port(config.CDP_DEBUG_PORT)
# 3. 启动浏览器
# 3. Launch browser
await self._launch_browser(browser_path, headless)
# 4. 注册清理处理器(确保异常退出时也能清理)
# 4. Register cleanup handlers (ensure cleanup on abnormal exit)
self._register_cleanup_handlers()
# 5. 通过CDP连接
# 5. Connect via CDP
await self._connect_via_cdp(playwright)
# 5. 创建浏览器上下文
# 6. Create browser context
browser_context = await self._create_browser_context(
playwright_proxy, user_agent
)
@@ -129,68 +129,68 @@ class CDPBrowserManager:
return browser_context
except Exception as e:
utils.logger.error(f"[CDPBrowserManager] CDP浏览器启动失败: {e}")
utils.logger.error(f"[CDPBrowserManager] CDP browser launch failed: {e}")
await self.cleanup()
raise
async def _get_browser_path(self) -> str:
"""
获取浏览器路径
Get browser path
"""
# 优先使用用户自定义路径
# Prefer user-defined path
if config.CUSTOM_BROWSER_PATH and os.path.isfile(config.CUSTOM_BROWSER_PATH):
utils.logger.info(
f"[CDPBrowserManager] 使用自定义浏览器路径: {config.CUSTOM_BROWSER_PATH}"
f"[CDPBrowserManager] Using custom browser path: {config.CUSTOM_BROWSER_PATH}"
)
return config.CUSTOM_BROWSER_PATH
# 自动检测浏览器路径
# Auto-detect browser path
browser_paths = self.launcher.detect_browser_paths()
if not browser_paths:
raise RuntimeError(
"未找到可用的浏览器。请确保已安装Chrome或Edge浏览器"
"或在配置文件中设置CUSTOM_BROWSER_PATH指定浏览器路径。"
"No available browser found. Please ensure Chrome or Edge browser is installed, "
"or set CUSTOM_BROWSER_PATH in config file to specify browser path."
)
browser_path = browser_paths[0] # 使用第一个找到的浏览器
browser_path = browser_paths[0] # Use the first browser found
browser_name, browser_version = self.launcher.get_browser_info(browser_path)
utils.logger.info(
f"[CDPBrowserManager] 检测到浏览器: {browser_name} ({browser_version})"
f"[CDPBrowserManager] Detected browser: {browser_name} ({browser_version})"
)
utils.logger.info(f"[CDPBrowserManager] 浏览器路径: {browser_path}")
utils.logger.info(f"[CDPBrowserManager] Browser path: {browser_path}")
return browser_path
async def _test_cdp_connection(self, debug_port: int) -> bool:
"""
测试CDP连接是否可用
Test if CDP connection is available
"""
try:
# 简单的socket连接测试
# Simple socket connection test
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.settimeout(5)
result = s.connect_ex(("localhost", debug_port))
if result == 0:
utils.logger.info(
f"[CDPBrowserManager] CDP端口 {debug_port} 可访问"
f"[CDPBrowserManager] CDP port {debug_port} is accessible"
)
return True
else:
utils.logger.warning(
f"[CDPBrowserManager] CDP端口 {debug_port} 不可访问"
f"[CDPBrowserManager] CDP port {debug_port} is not accessible"
)
return False
except Exception as e:
utils.logger.warning(f"[CDPBrowserManager] CDP连接测试失败: {e}")
utils.logger.warning(f"[CDPBrowserManager] CDP connection test failed: {e}")
return False
async def _launch_browser(self, browser_path: str, headless: bool):
"""
启动浏览器进程
Launch browser process
"""
# 设置用户数据目录(如果启用了保存登录状态)
# Set user data directory (if save login state is enabled)
user_data_dir = None
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(
@@ -199,9 +199,9 @@ class CDPBrowserManager:
f"cdp_{config.USER_DATA_DIR % config.PLATFORM}",
)
os.makedirs(user_data_dir, exist_ok=True)
utils.logger.info(f"[CDPBrowserManager] 用户数据目录: {user_data_dir}")
utils.logger.info(f"[CDPBrowserManager] User data directory: {user_data_dir}")
# 启动浏览器
# Launch browser
self.launcher.browser_process = self.launcher.launch_browser(
browser_path=browser_path,
debug_port=self.debug_port,
@@ -209,24 +209,24 @@ class CDPBrowserManager:
user_data_dir=user_data_dir,
)
# 等待浏览器准备就绪
# Wait for browser to be ready
if not self.launcher.wait_for_browser_ready(
self.debug_port, config.BROWSER_LAUNCH_TIMEOUT
):
raise RuntimeError(f"浏览器在 {config.BROWSER_LAUNCH_TIMEOUT} 秒内未能启动")
raise RuntimeError(f"Browser failed to start within {config.BROWSER_LAUNCH_TIMEOUT} seconds")
# 额外等待一秒让CDP服务完全启动
# Wait one extra second so the CDP service can fully start
await asyncio.sleep(1)
# 测试CDP连接
# Test CDP connection
if not await self._test_cdp_connection(self.debug_port):
utils.logger.warning(
"[CDPBrowserManager] CDP连接测试失败,但将继续尝试连接"
"[CDPBrowserManager] CDP connection test failed, but will continue to try connecting"
)
async def _get_browser_websocket_url(self, debug_port: int) -> str:
"""
获取浏览器的WebSocket连接URL
Get browser WebSocket connection URL
"""
try:
async with httpx.AsyncClient() as client:
@@ -238,196 +238,196 @@ class CDPBrowserManager:
ws_url = data.get("webSocketDebuggerUrl")
if ws_url:
utils.logger.info(
f"[CDPBrowserManager] 获取到浏览器WebSocket URL: {ws_url}"
f"[CDPBrowserManager] Got browser WebSocket URL: {ws_url}"
)
return ws_url
else:
raise RuntimeError("未找到webSocketDebuggerUrl")
raise RuntimeError("webSocketDebuggerUrl not found")
else:
raise RuntimeError(f"HTTP {response.status_code}: {response.text}")
except Exception as e:
utils.logger.error(f"[CDPBrowserManager] 获取WebSocket URL失败: {e}")
utils.logger.error(f"[CDPBrowserManager] Failed to get WebSocket URL: {e}")
raise
async def _connect_via_cdp(self, playwright: Playwright):
"""
通过CDP连接到浏览器
Connect to browser via CDP
"""
try:
# 获取正确的WebSocket URL
# Get correct WebSocket URL
ws_url = await self._get_browser_websocket_url(self.debug_port)
utils.logger.info(f"[CDPBrowserManager] 正在通过CDP连接到浏览器: {ws_url}")
utils.logger.info(f"[CDPBrowserManager] Connecting to browser via CDP: {ws_url}")
# 使用PlaywrightconnectOverCDP方法连接
# Use Playwright's connect_over_cdp method to connect
self.browser = await playwright.chromium.connect_over_cdp(ws_url)
if self.browser.is_connected():
utils.logger.info("[CDPBrowserManager] 成功连接到浏览器")
utils.logger.info("[CDPBrowserManager] Successfully connected to browser")
utils.logger.info(
f"[CDPBrowserManager] 浏览器上下文数量: {len(self.browser.contexts)}"
f"[CDPBrowserManager] Browser contexts count: {len(self.browser.contexts)}"
)
else:
raise RuntimeError("CDP连接失败")
raise RuntimeError("CDP connection failed")
except Exception as e:
utils.logger.error(f"[CDPBrowserManager] CDP连接失败: {e}")
utils.logger.error(f"[CDPBrowserManager] CDP connection failed: {e}")
raise
async def _create_browser_context(
self, playwright_proxy: Optional[Dict] = None, user_agent: Optional[str] = None
) -> BrowserContext:
"""
创建或获取浏览器上下文
Create or get browser context
"""
if not self.browser:
raise RuntimeError("浏览器未连接")
raise RuntimeError("Browser not connected")
# 获取现有上下文或创建新的上下文
# Get existing context or create new context
contexts = self.browser.contexts
if contexts:
# 使用现有的第一个上下文
# Use existing first context
browser_context = contexts[0]
utils.logger.info("[CDPBrowserManager] 使用现有的浏览器上下文")
utils.logger.info("[CDPBrowserManager] Using existing browser context")
else:
# 创建新的上下文
# Create new context
context_options = {
"viewport": {"width": 1920, "height": 1080},
"accept_downloads": True,
}
# 设置用户代理
# Set user agent
if user_agent:
context_options["user_agent"] = user_agent
utils.logger.info(f"[CDPBrowserManager] 设置用户代理: {user_agent}")
utils.logger.info(f"[CDPBrowserManager] Setting user agent: {user_agent}")
# 注意CDP模式下代理设置可能不生效因为浏览器已经启动
# Note: Proxy settings may not work in CDP mode since browser is already launched
if playwright_proxy:
utils.logger.warning(
"[CDPBrowserManager] 警告: CDP模式下代理设置可能不生效"
"建议在浏览器启动前配置系统代理或浏览器代理扩展"
"[CDPBrowserManager] Warning: Proxy settings may not work in CDP mode, "
"recommend configuring system proxy or browser proxy extension before launching browser"
)
browser_context = await self.browser.new_context(**context_options)
utils.logger.info("[CDPBrowserManager] 创建新的浏览器上下文")
utils.logger.info("[CDPBrowserManager] Created new browser context")
return browser_context
async def add_stealth_script(self, script_path: str = "libs/stealth.min.js"):
"""
添加反检测脚本
Add anti-detection script
"""
if self.browser_context and os.path.exists(script_path):
try:
await self.browser_context.add_init_script(path=script_path)
utils.logger.info(
f"[CDPBrowserManager] 已添加反检测脚本: {script_path}"
f"[CDPBrowserManager] Added anti-detection script: {script_path}"
)
except Exception as e:
utils.logger.warning(f"[CDPBrowserManager] 添加反检测脚本失败: {e}")
utils.logger.warning(f"[CDPBrowserManager] Failed to add anti-detection script: {e}")
async def add_cookies(self, cookies: list):
"""
添加Cookie
Add cookies
"""
if self.browser_context:
try:
await self.browser_context.add_cookies(cookies)
utils.logger.info(f"[CDPBrowserManager] 已添加 {len(cookies)} 个Cookie")
utils.logger.info(f"[CDPBrowserManager] Added {len(cookies)} cookies")
except Exception as e:
utils.logger.warning(f"[CDPBrowserManager] 添加Cookie失败: {e}")
utils.logger.warning(f"[CDPBrowserManager] Failed to add cookies: {e}")
async def get_cookies(self) -> list:
"""
获取当前Cookie
Get current cookies
"""
if self.browser_context:
try:
cookies = await self.browser_context.cookies()
return cookies
except Exception as e:
utils.logger.warning(f"[CDPBrowserManager] 获取Cookie失败: {e}")
utils.logger.warning(f"[CDPBrowserManager] Failed to get cookies: {e}")
return []
return []
async def cleanup(self, force: bool = False):
"""
清理资源
Cleanup resources
Args:
force: 是否强制清理浏览器进程(忽略AUTO_CLOSE_BROWSER配置)
force: Whether to force cleanup browser process (ignoring AUTO_CLOSE_BROWSER config)
"""
try:
# 关闭浏览器上下文
# Close browser context
if self.browser_context:
try:
# 检查上下文是否已经关闭
# 尝试获取页面列表,如果失败说明已经关闭
# Check whether the context is already closed:
# try to read the page list; if that fails, the context has already been closed
try:
pages = self.browser_context.pages
if pages is not None:
await self.browser_context.close()
utils.logger.info("[CDPBrowserManager] 浏览器上下文已关闭")
utils.logger.info("[CDPBrowserManager] Browser context closed")
except:
utils.logger.debug("[CDPBrowserManager] 浏览器上下文已经被关闭")
utils.logger.debug("[CDPBrowserManager] Browser context already closed")
except Exception as context_error:
# 只在错误不是因为已关闭时才记录警告
# Only log warning if error is not due to already being closed
error_msg = str(context_error).lower()
if "closed" not in error_msg and "disconnected" not in error_msg:
utils.logger.warning(
f"[CDPBrowserManager] 关闭浏览器上下文失败: {context_error}"
f"[CDPBrowserManager] Failed to close browser context: {context_error}"
)
else:
utils.logger.debug(f"[CDPBrowserManager] 浏览器上下文已关闭: {context_error}")
utils.logger.debug(f"[CDPBrowserManager] Browser context already closed: {context_error}")
finally:
self.browser_context = None
# 断开浏览器连接
# Disconnect browser
if self.browser:
try:
# 检查浏览器是否仍然连接
# Check if browser is still connected
if self.browser.is_connected():
await self.browser.close()
utils.logger.info("[CDPBrowserManager] 浏览器连接已断开")
utils.logger.info("[CDPBrowserManager] Browser connection disconnected")
else:
utils.logger.debug("[CDPBrowserManager] 浏览器连接已经断开")
utils.logger.debug("[CDPBrowserManager] Browser connection already disconnected")
except Exception as browser_error:
# 只在错误不是因为已关闭时才记录警告
# Only log warning if error is not due to already being closed
error_msg = str(browser_error).lower()
if "closed" not in error_msg and "disconnected" not in error_msg:
utils.logger.warning(
f"[CDPBrowserManager] 关闭浏览器连接失败: {browser_error}"
f"[CDPBrowserManager] Failed to close browser connection: {browser_error}"
)
else:
utils.logger.debug(f"[CDPBrowserManager] 浏览器连接已关闭: {browser_error}")
utils.logger.debug(f"[CDPBrowserManager] Browser connection already closed: {browser_error}")
finally:
self.browser = None
# 关闭浏览器进程
# force=True 时强制关闭,忽略AUTO_CLOSE_BROWSER配置
# 这用于处理异常退出或手动清理的情况
# Close browser process
# force=True means force close, ignoring AUTO_CLOSE_BROWSER config
# Used for handling abnormal exit or manual cleanup
if force or config.AUTO_CLOSE_BROWSER:
if self.launcher and self.launcher.browser_process:
self.launcher.cleanup()
else:
utils.logger.debug("[CDPBrowserManager] 没有需要清理的浏览器进程")
utils.logger.debug("[CDPBrowserManager] No browser process to cleanup")
else:
utils.logger.info(
"[CDPBrowserManager] 浏览器进程保持运行(AUTO_CLOSE_BROWSER=False"
"[CDPBrowserManager] Browser process kept running (AUTO_CLOSE_BROWSER=False)"
)
except Exception as e:
utils.logger.error(f"[CDPBrowserManager] 清理资源时出错: {e}")
utils.logger.error(f"[CDPBrowserManager] Error during resource cleanup: {e}")
def is_connected(self) -> bool:
"""
检查是否已连接到浏览器
Check if connected to browser
"""
return self.browser is not None and self.browser.is_connected()
async def get_browser_info(self) -> Dict[str, Any]:
"""
获取浏览器信息
Get browser info
"""
if not self.browser:
return {}
@@ -443,5 +443,5 @@ class CDPBrowserManager:
"is_connected": self.is_connected(),
}
except Exception as e:
utils.logger.warning(f"[CDPBrowserManager] 获取浏览器信息失败: {e}")
utils.logger.warning(f"[CDPBrowserManager] Failed to get browser info: {e}")
return {}