i18n: translate all Chinese comments, docstrings, and logger messages to English

Comprehensive translation of Chinese text to English across the entire codebase:

- api/: FastAPI server documentation and logger messages
- cache/: Cache abstraction layer comments and docstrings
- database/: Database models and MongoDB store documentation
- media_platform/: All platform crawlers (Bilibili, Douyin, Kuaishou, Tieba, Weibo, Xiaohongshu, Zhihu)
- model/: Data model documentation
- proxy/: Proxy pool and provider documentation
- store/: Data storage layer comments
- tools/: Utility functions and browser automation
- test/: Test file documentation

Preserved: the Chinese disclaimer header (lines 10-18) is left untranslated for legal compliance.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-06-08 10:57:26 +08:00 · 2025-12-26 23:27:19 +08:00
parent 1544d13dd5
commit 157ddfb21b
93 changed files with 1971 additions and 1955 deletions
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -79,9 +79,9 @@ class TieBaCrawler(AbstractCrawler):
            )

        async with async_playwright() as playwright:
-            # 根据配置选择启动模式
+            # Choose startup mode based on configuration
            if config.ENABLE_CDP_MODE:
-                utils.logger.info("[BaiduTieBaCrawler] 使用CDP模式启动浏览器")
+                utils.logger.info("[BaiduTieBaCrawler] Launching browser in CDP mode")
                self.browser_context = await self.launch_browser_with_cdp(
                    playwright,
                    playwright_proxy_format,
@@ -89,7 +89,7 @@ class TieBaCrawler(AbstractCrawler):
                    headless=config.CDP_HEADLESS,
                )
            else:
-                utils.logger.info("[BaiduTieBaCrawler] 使用标准模式启动浏览器")
+                utils.logger.info("[BaiduTieBaCrawler] Launching browser in standard mode")
                # Launch a browser context.
                chromium = playwright.chromium
                self.browser_context = await self.launch_browser(
@@ -99,12 +99,12 @@ class TieBaCrawler(AbstractCrawler):
                    headless=config.HEADLESS,
                )

-            # 注入反检测脚本 - 针对百度的特殊检测
+            # Inject anti-detection scripts - for Baidu's special detection
            await self._inject_anti_detection_scripts()

            self.context_page = await self.browser_context.new_page()

-            # 先访问百度首页,再点击贴吧链接,避免触发安全验证
+            # First visit Baidu homepage, then click Tieba link to avoid triggering security verification
            await self._navigate_to_tieba_via_baidu()

            # Create a client to interact with the baidutieba website.
@@ -399,29 +399,29 @@ class TieBaCrawler(AbstractCrawler):

    async def _navigate_to_tieba_via_baidu(self):
        """
-        模拟真实用户访问路径:
-        1. 先访问百度首页 (https://www.baidu.com/)
-        2. 等待页面加载
-        3. 点击顶部导航栏的"贴吧"链接
-        4. 跳转到贴吧首页
+        Simulate real user access path:
+        1. First visit Baidu homepage (https://www.baidu.com/)
+        2. Wait for page to load
+        3. Click "Tieba" link in top navigation bar
+        4. Jump to Tieba homepage

-        这样做可以避免触发百度的安全验证
+        This avoids triggering Baidu's security verification
        """
-        utils.logger.info("[TieBaCrawler] 模拟真实用户访问路径...")
+        utils.logger.info("[TieBaCrawler] Simulating real user access path...")

        try:
-            # Step 1: 访问百度首页
-            utils.logger.info("[TieBaCrawler] Step 1: 访问百度首页 https://www.baidu.com/")
+            # Step 1: Visit Baidu homepage
+            utils.logger.info("[TieBaCrawler] Step 1: Visiting Baidu homepage https://www.baidu.com/")
            await self.context_page.goto("https://www.baidu.com/", wait_until="domcontentloaded")

-            # Step 2: 等待页面加载,使用配置文件中的延时设置
-            utils.logger.info(f"[TieBaCrawler] Step 2: 等待 {config.CRAWLER_MAX_SLEEP_SEC}秒 模拟用户浏览...")
+            # Step 2: Wait for page loading, using delay setting from config file
+            utils.logger.info(f"[TieBaCrawler] Step 2: Waiting {config.CRAWLER_MAX_SLEEP_SEC} seconds to simulate user browsing...")
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

-            # Step 3: 查找并点击"贴吧"链接
-            utils.logger.info("[TieBaCrawler] Step 3: 查找并点击'贴吧'链接...")
+            # Step 3: Find and click "Tieba" link
+            utils.logger.info("[TieBaCrawler] Step 3: Finding and clicking 'Tieba' link...")

-            # 尝试多种选择器,确保能找到贴吧链接
+            # Try multiple selectors to ensure finding the Tieba link
            tieba_selectors = [
                'a[href="http://tieba.baidu.com/"]',
                'a[href="https://tieba.baidu.com/"]',
@@ -434,74 +434,74 @@ class TieBaCrawler(AbstractCrawler):
                try:
                    tieba_link = await self.context_page.wait_for_selector(selector, timeout=5000)
                    if tieba_link:
-                        utils.logger.info(f"[TieBaCrawler] 找到贴吧链接 (selector: {selector})")
+                        utils.logger.info(f"[TieBaCrawler] Found Tieba link (selector: {selector})")
                        break
                except Exception:
                    continue

            if not tieba_link:
-                utils.logger.warning("[TieBaCrawler] 未找到贴吧链接,直接访问贴吧首页")
+                utils.logger.warning("[TieBaCrawler] Tieba link not found, directly accessing Tieba homepage")
                await self.context_page.goto(self.index_url, wait_until="domcontentloaded")
                return

-            # Step 4: 点击贴吧链接 (检查是否会打开新标签页)
-            utils.logger.info("[TieBaCrawler] Step 4: 点击贴吧链接...")
+            # Step 4: Click Tieba link (check if it will open in a new tab)
+            utils.logger.info("[TieBaCrawler] Step 4: Clicking Tieba link...")

-            # 检查链接的target属性
+            # Check link's target attribute
            target_attr = await tieba_link.get_attribute("target")
-            utils.logger.info(f"[TieBaCrawler] 链接target属性: {target_attr}")
+            utils.logger.info(f"[TieBaCrawler] Link target attribute: {target_attr}")

            if target_attr == "_blank":
-                # 如果是新标签页,需要等待新页面并切换
-                utils.logger.info("[TieBaCrawler] 链接会在新标签页打开,等待新页面...")
+                # If it's a new tab, need to wait for new page and switch
+                utils.logger.info("[TieBaCrawler] Link will open in new tab, waiting for new page...")

                async with self.browser_context.expect_page() as new_page_info:
                    await tieba_link.click()

-                # 获取新打开的页面
+                # Get newly opened page
                new_page = await new_page_info.value
                await new_page.wait_for_load_state("domcontentloaded")

-                # 关闭旧的百度首页
+                # Close old Baidu homepage
                await self.context_page.close()

-                # 切换到新的贴吧页面
+                # Switch to new Tieba page
                self.context_page = new_page
-                utils.logger.info("[TieBaCrawler] ✅ 已切换到新标签页 (贴吧页面)")
+                utils.logger.info("[TieBaCrawler] Successfully switched to new tab (Tieba page)")
            else:
-                # 如果是同一标签页跳转,正常等待导航
-                utils.logger.info("[TieBaCrawler] 链接在当前标签页跳转...")
+                # If it's same tab navigation, wait for navigation normally
+                utils.logger.info("[TieBaCrawler] Link navigates in current tab...")
                async with self.context_page.expect_navigation(wait_until="domcontentloaded"):
                    await tieba_link.click()

-            # Step 5: 等待页面稳定,使用配置文件中的延时设置
-            utils.logger.info(f"[TieBaCrawler] Step 5: 页面加载完成,等待 {config.CRAWLER_MAX_SLEEP_SEC}秒...")
+            # Step 5: Wait for page to stabilize, using delay setting from config file
+            utils.logger.info(f"[TieBaCrawler] Step 5: Page loaded, waiting {config.CRAWLER_MAX_SLEEP_SEC} seconds...")
            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)

            current_url = self.context_page.url
-            utils.logger.info(f"[TieBaCrawler] ✅ 成功通过百度首页进入贴吧! 当前URL: {current_url}")
+            utils.logger.info(f"[TieBaCrawler] Successfully entered Tieba via Baidu homepage! Current URL: {current_url}")

        except Exception as e:
-            utils.logger.error(f"[TieBaCrawler] 通过百度首页访问贴吧失败: {e}")
-            utils.logger.info("[TieBaCrawler] 回退:直接访问贴吧首页")
+            utils.logger.error(f"[TieBaCrawler] Failed to access Tieba via Baidu homepage: {e}")
+            utils.logger.info("[TieBaCrawler] Fallback: directly accessing Tieba homepage")
            await self.context_page.goto(self.index_url, wait_until="domcontentloaded")

    async def _inject_anti_detection_scripts(self):
        """
-        注入反检测JavaScript脚本
-        针对百度贴吧的特殊检测机制
+        Inject anti-detection JavaScript scripts
+        For Baidu Tieba's special detection mechanism
        """
        utils.logger.info("[TieBaCrawler] Injecting anti-detection scripts...")

-        # 轻量级反检测脚本,只覆盖关键检测点
+        # Lightweight anti-detection script, only covering key detection points
        anti_detection_js = """
-        // 覆盖 navigator.webdriver
+        // Override navigator.webdriver
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined,
            configurable: true
        });

-        // 覆盖 window.navigator.chrome
+        // Override window.navigator.chrome
        if (!window.navigator.chrome) {
            window.navigator.chrome = {
                runtime: {},
@@ -511,7 +511,7 @@ class TieBaCrawler(AbstractCrawler):
            };
        }

-        // 覆盖 Permissions API
+        // Override Permissions API
        const originalQuery = window.navigator.permissions.query;
        window.navigator.permissions.query = (parameters) => (
            parameters.name === 'notifications' ?
@@ -519,19 +519,19 @@ class TieBaCrawler(AbstractCrawler):
                originalQuery(parameters)
        );

-        // 覆盖 plugins 长度(让它看起来有插件)
+        // Override plugins length (make it look like there are plugins)
        Object.defineProperty(navigator, 'plugins', {
            get: () => [1, 2, 3, 4, 5],
            configurable: true
        });

-        // 覆盖 languages
+        // Override languages
        Object.defineProperty(navigator, 'languages', {
            get: () => ['zh-CN', 'zh', 'en'],
            configurable: true
        });

-        // 移除 window.cdc_ 等 ChromeDriver 残留
+        // Remove window.cdc_ and other ChromeDriver remnants
        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Array;
        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Promise;
        delete window.cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
@@ -548,21 +548,21 @@ class TieBaCrawler(AbstractCrawler):
        """
        Create tieba client with real browser User-Agent and complete headers
        Args:
-            httpx_proxy: HTTP代理
-            ip_pool: IP代理池
+            httpx_proxy: HTTP proxy
+            ip_pool: IP proxy pool

        Returns:
-            BaiduTieBaClient实例
+            BaiduTieBaClient instance
        """
        utils.logger.info("[TieBaCrawler.create_tieba_client] Begin create tieba API client...")

-        # 从真实浏览器提取User-Agent,避免被检测
+        # Extract User-Agent from real browser to avoid detection
        user_agent = await self.context_page.evaluate("() => navigator.userAgent")
        utils.logger.info(f"[TieBaCrawler.create_tieba_client] Extracted User-Agent from browser: {user_agent}")

        cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())

-        # 构建完整的浏览器请求头,模拟真实浏览器行为
+        # Build complete browser request headers, simulating real browser behavior
        tieba_client = BaiduTieBaClient(
            timeout=10,
            ip_pool=ip_pool,
@@ -572,7 +572,7 @@ class TieBaCrawler(AbstractCrawler):
                "Accept-Language": "zh-CN,zh;q=0.9",
                "Accept-Encoding": "gzip, deflate, br",
                "Connection": "keep-alive",
-                "User-Agent": user_agent,  # 使用真实浏览器的UA
+                "User-Agent": user_agent,  # Use real browser UA
                "Cookie": cookie_str,
                "Host": "tieba.baidu.com",
                "Referer": "https://tieba.baidu.com/",
@@ -585,7 +585,7 @@ class TieBaCrawler(AbstractCrawler):
                "sec-ch-ua-mobile": "?0",
                "sec-ch-ua-platform": '"macOS"',
            },
-            playwright_page=self.context_page,  # 传入playwright页面对象
+            playwright_page=self.context_page,  # Pass in playwright page object
        )
        return tieba_client

@@ -623,7 +623,7 @@ class TieBaCrawler(AbstractCrawler):
                proxy=playwright_proxy,  # type: ignore
                viewport={"width": 1920, "height": 1080},
                user_agent=user_agent,
-                channel="chrome",  # 使用系统的Chrome稳定版
+                channel="chrome",  # Use system's stable Chrome version
            )
            return browser_context
        else:
@@ -641,7 +641,7 @@ class TieBaCrawler(AbstractCrawler):
        headless: bool = True,
    ) -> BrowserContext:
        """
-        使用CDP模式启动浏览器
+        Launch browser using CDP mode
        """
        try:
            self.cdp_manager = CDPBrowserManager()
@@ -652,15 +652,15 @@ class TieBaCrawler(AbstractCrawler):
                headless=headless,
            )

-            # 显示浏览器信息
+            # Display browser information
            browser_info = await self.cdp_manager.get_browser_info()
-            utils.logger.info(f"[TieBaCrawler] CDP浏览器信息: {browser_info}")
+            utils.logger.info(f"[TieBaCrawler] CDP browser info: {browser_info}")

            return browser_context

        except Exception as e:
-            utils.logger.error(f"[TieBaCrawler] CDP模式启动失败，回退到标准模式: {e}")
-            # 回退到标准模式
+            utils.logger.error(f"[TieBaCrawler] CDP mode launch failed, falling back to standard mode: {e}")
+            # Fall back to standard mode
            chromium = playwright.chromium
            return await self.launch_browser(
                chromium, playwright_proxy, user_agent, headless
@@ -672,7 +672,7 @@ class TieBaCrawler(AbstractCrawler):
        Returns:

        """
-        # 如果使用CDP模式，需要特殊处理
+        # If using CDP mode, need special handling
        if self.cdp_manager:
            await self.cdp_manager.cleanup()
            self.cdp_manager = None