refactor: tieba 改为浏览器获取数据

2026-06-09 03:17:25 +08:00 · 2025-10-19 17:09:55 +08:00
parent 26a261bc09
commit ed6e0bfb5f
3 changed files with 606 additions and 128 deletions
--- a/media_platform/tieba/client.py
+++ b/media_platform/tieba/client.py
@@ -11,10 +11,10 @@
 import asyncio
 import json
 from typing import Any, Callable, Dict, List, Optional, Union
-from urllib.parse import urlencode
+from urllib.parse import urlencode, quote

-import httpx
-from playwright.async_api import BrowserContext
+import requests
+from playwright.async_api import BrowserContext, Page
 from tenacity import RetryError, retry, stop_after_attempt, wait_fixed

 import config
@@ -34,34 +34,76 @@ class BaiduTieBaClient(AbstractApiClient):
        timeout=10,
        ip_pool=None,
        default_ip_proxy=None,
+        headers: Dict[str, str] = None,
+        playwright_page: Optional[Page] = None,
    ):
        self.ip_pool: Optional[ProxyIpPool] = ip_pool
        self.timeout = timeout
-        self.headers = {
+        # 使用传入的headers(包含真实浏览器UA)或默认headers
+        self.headers = headers or {
            "User-Agent": utils.get_user_agent(),
-            "Cookies": "",
+            "Cookie": "",
        }
        self._host = "https://tieba.baidu.com"
        self._page_extractor = TieBaExtractor()
        self.default_ip_proxy = default_ip_proxy
+        self.playwright_page = playwright_page  # Playwright页面对象
+
+    def _sync_request(self, method, url, proxy=None, **kwargs):
+        """
+        同步的requests请求方法
+        Args:
+            method: 请求方法
+            url: 请求的URL
+            proxy: 代理IP
+            **kwargs: 其他请求参数
+
+        Returns:
+            response对象
+        """
+        # 构造代理字典
+        proxies = None
+        if proxy:
+            proxies = {
+                "http": proxy,
+                "https": proxy,
+            }
+
+        # 发送请求
+        response = requests.request(
+            method=method,
+            url=url,
+            headers=self.headers,
+            proxies=proxies,
+            timeout=self.timeout,
+            **kwargs
+        )
+        return response

    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
    async def request(self, method, url, return_ori_content=False, proxy=None, **kwargs) -> Union[str, Any]:
        """
-        封装httpx的公共请求方法，对请求响应做一些处理
+        封装requests的公共请求方法，对请求响应做一些处理
        Args:
            method: 请求方法
            url: 请求的URL
            return_ori_content: 是否返回原始内容
-            proxies: 代理IP
+            proxy: 代理IP
            **kwargs: 其他请求参数，例如请求头、请求体等

        Returns:

        """
        actual_proxy = proxy if proxy else self.default_ip_proxy
-        async with httpx.AsyncClient(proxy=actual_proxy) as client:
-            response = await client.request(method, url, timeout=self.timeout, headers=self.headers, **kwargs)
+
+        # 在线程池中执行同步的requests请求
+        response = await asyncio.to_thread(
+            self._sync_request,
+            method,
+            url,
+            actual_proxy,
+            **kwargs
+        )

        if response.status_code != 200:
            utils.logger.error(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")
@@ -69,7 +111,7 @@ class BaiduTieBaClient(AbstractApiClient):
            raise Exception(f"Request failed, method: {method}, url: {url}, status code: {response.status_code}")

        if response.text == "" or response.text == "blocked":
-            utils.logger.error(f"request params incrr, response.text: {response.text}")
+            utils.logger.error(f"request params incorrect, response.text: {response.text}")
            raise Exception("account blocked")

        if return_ori_content:
@@ -119,26 +161,41 @@ class BaiduTieBaClient(AbstractApiClient):
        json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False)
        return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, **kwargs)

-    async def pong(self) -> bool:
+    async def pong(self, browser_context: BrowserContext = None) -> bool:
        """
        用于检查登录态是否失效了
-        Returns:
+        通过检查浏览器Cookie判断登录态,而不调用API,以避免触发反爬检测
+        Args:
+            browser_context: 浏览器上下文对象

+        Returns:
+            bool: True表示已登录,False表示未登录
        """
-        utils.logger.info("[BaiduTieBaClient.pong] Begin to pong tieba...")
+        utils.logger.info("[BaiduTieBaClient.pong] Begin to check tieba login state by cookies...")
+
+        if not browser_context:
+            utils.logger.warning("[BaiduTieBaClient.pong] browser_context is None, assume not logged in")
+            return False
+
        try:
-            uri = "/mo/q/sync"
-            res: Dict = await self.get(uri)
-            utils.logger.info(f"[BaiduTieBaClient.pong] res: {res}")
-            if res and res.get("no") == 0:
-                ping_flag = True
+            # 从浏览器获取cookies并检查关键登录cookie
+            _, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+
+            # 百度贴吧的登录标识: STOKEN、PTOKEN 或 BDUSS
+            stoken = cookie_dict.get("STOKEN")
+            ptoken = cookie_dict.get("PTOKEN")
+            bduss = cookie_dict.get("BDUSS")  # 百度通用登录cookie
+
+            if stoken or ptoken or bduss:
+                utils.logger.info(f"[BaiduTieBaClient.pong] Login state verified by cookies (STOKEN: {bool(stoken)}, PTOKEN: {bool(ptoken)}, BDUSS: {bool(bduss)})")
+                return True
            else:
-                utils.logger.info(f"[BaiduTieBaClient.pong] user not login, will try to login again...")
-                ping_flag = False
+                utils.logger.info("[BaiduTieBaClient.pong] No valid login cookies found, need to login")
+                return False
+
        except Exception as e:
-            utils.logger.error(f"[BaiduTieBaClient.pong] Ping tieba failed: {e}, and try to login again...")
-            ping_flag = False
-        return ping_flag
+            utils.logger.error(f"[BaiduTieBaClient.pong] Check login state failed: {e}, assume not logged in")
+            return False

    async def update_cookies(self, browser_context: BrowserContext):
        """
@@ -149,7 +206,9 @@ class BaiduTieBaClient(AbstractApiClient):
        Returns:

        """
-        pass
+        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
+        self.headers["Cookie"] = cookie_str
+        utils.logger.info("[BaiduTieBaClient.update_cookies] Cookie has been updated")

    async def get_notes_by_keyword(
        self,
@@ -160,7 +219,7 @@ class BaiduTieBaClient(AbstractApiClient):
        note_type: SearchNoteType = SearchNoteType.FIXED_THREAD,
    ) -> List[TiebaNote]:
        """
-        根据关键词搜索贴吧帖子
+        根据关键词搜索贴吧帖子 (使用Playwright访问页面,避免API检测)
        Args:
            keyword: 关键词
            page: 分页第几页
@@ -170,30 +229,81 @@ class BaiduTieBaClient(AbstractApiClient):
        Returns:

        """
-        uri = "/f/search/res"
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_notes_by_keyword] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based search")
+
+        # 构造搜索URL
+        # 示例: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=编程
+        search_url = f"{self._host}/f/search/res"
        params = {
-            "isnew": 1,
+            "ie": "utf-8",
            "qw": keyword,
            "rn": page_size,
            "pn": page,
            "sm": sort.value,
            "only_thread": note_type.value,
        }
-        page_content = await self.get(uri, params=params, return_ori_content=True)
-        return self._page_extractor.extract_search_note_list(page_content)
+
+        # 拼接完整URL
+        full_url = f"{search_url}?{urlencode(params)}"
+        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 访问搜索页面: {full_url}")
+
+        try:
+            # 使用Playwright访问搜索页面
+            await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
+
+            # 等待页面加载,使用配置文件中的延时设置
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # 获取页面HTML内容
+            page_content = await self.playwright_page.content()
+            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 成功获取搜索页面HTML,长度: {len(page_content)}")
+
+            # 提取搜索结果
+            notes = self._page_extractor.extract_search_note_list(page_content)
+            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] 提取到 {len(notes)} 条帖子")
+            return notes
+
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_keyword] 搜索失败: {e}")
+            raise

    async def get_note_by_id(self, note_id: str) -> TiebaNote:
        """
-        根据帖子ID获取帖子详情
+        根据帖子ID获取帖子详情 (使用Playwright访问页面,避免API检测)
        Args:
-            note_id:
+            note_id: 帖子ID

        Returns:
-
+            TiebaNote: 帖子详情对象
        """
-        uri = f"/p/{note_id}"
-        page_content = await self.get(uri, return_ori_content=True)
-        return self._page_extractor.extract_note_detail(page_content)
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_note_by_id] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based note detail fetching")
+
+        # 构造帖子详情URL
+        note_url = f"{self._host}/p/{note_id}"
+        utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] 访问帖子详情页面: {note_url}")
+
+        try:
+            # 使用Playwright访问帖子详情页面
+            await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
+
+            # 等待页面加载,使用配置文件中的延时设置
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # 获取页面HTML内容
+            page_content = await self.playwright_page.content()
+            utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] 成功获取帖子详情HTML,长度: {len(page_content)}")
+
+            # 提取帖子详情
+            note_detail = self._page_extractor.extract_note_detail(page_content)
+            return note_detail
+
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_note_by_id] 获取帖子详情失败: {e}")
+            raise

    async def get_note_all_comments(
        self,
@@ -203,35 +313,68 @@ class BaiduTieBaClient(AbstractApiClient):
        max_count: int = 10,
    ) -> List[TiebaComment]:
        """
-        获取指定帖子下的所有一级评论，该方法会一直查找一个帖子下的所有评论信息
+        获取指定帖子下的所有一级评论 (使用Playwright访问页面,避免API检测)
        Args:
            note_detail: 帖子详情对象
            crawl_interval: 爬取一次笔记的延迟单位（秒）
-            callback: 一次笔记爬取结束后
+            callback: 一次笔记爬取结束后的回调函数
            max_count: 一次帖子爬取的最大评论数量
        Returns:
-
+            List[TiebaComment]: 评论列表
        """
-        uri = f"/p/{note_detail.note_id}"
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_note_all_comments] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based comment fetching")
+
        result: List[TiebaComment] = []
        current_page = 1
+
        while note_detail.total_replay_page >= current_page and len(result) < max_count:
-            params = {
-                "pn": current_page,
-            }
-            page_content = await self.get(uri, params=params, return_ori_content=True)
-            comments = self._page_extractor.extract_tieba_note_parment_comments(page_content, note_id=note_detail.note_id)
-            if not comments:
+            # 构造评论页URL
+            comment_url = f"{self._host}/p/{note_detail.note_id}?pn={current_page}"
+            utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 访问评论页面: {comment_url}")
+
+            try:
+                # 使用Playwright访问评论页面
+                await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
+
+                # 等待页面加载,使用配置文件中的延时设置
+                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+                # 获取页面HTML内容
+                page_content = await self.playwright_page.content()
+
+                # 提取评论
+                comments = self._page_extractor.extract_tieba_note_parment_comments(
+                    page_content, note_id=note_detail.note_id
+                )
+
+                if not comments:
+                    utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 第{current_page}页没有评论,停止爬取")
+                    break
+
+                # 限制评论数量
+                if len(result) + len(comments) > max_count:
+                    comments = comments[:max_count - len(result)]
+
+                if callback:
+                    await callback(note_detail.note_id, comments)
+
+                result.extend(comments)
+
+                # 获取所有子评论
+                await self.get_comments_all_sub_comments(
+                    comments, crawl_interval=crawl_interval, callback=callback
+                )
+
+                await asyncio.sleep(crawl_interval)
+                current_page += 1
+
+            except Exception as e:
+                utils.logger.error(f"[BaiduTieBaClient.get_note_all_comments] 获取第{current_page}页评论失败: {e}")
                break
-            if len(result) + len(comments) > max_count:
-                comments = comments[:max_count - len(result)]
-            if callback:
-                await callback(note_detail.note_id, comments)
-            result.extend(comments)
-            # 获取所有子评论
-            await self.get_comments_all_sub_comments(comments, crawl_interval=crawl_interval, callback=callback)
-            await asyncio.sleep(crawl_interval)
-            current_page += 1
+
+        utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] 共获取 {len(result)} 条一级评论")
        return result

    async def get_comments_all_sub_comments(
@@ -241,93 +384,194 @@ class BaiduTieBaClient(AbstractApiClient):
        callback: Optional[Callable] = None,
    ) -> List[TiebaComment]:
        """
-        获取指定评论下的所有子评论
+        获取指定评论下的所有子评论 (使用Playwright访问页面,避免API检测)
        Args:
            comments: 评论列表
            crawl_interval: 爬取一次笔记的延迟单位（秒）
-            callback: 一次笔记爬取结束后
+            callback: 一次笔记爬取结束后的回调函数

        Returns:
-
+            List[TiebaComment]: 子评论列表
        """
-        uri = "/p/comment"
        if not config.ENABLE_GET_SUB_COMMENTS:
            return []

-        # # 贴吧获取所有子评论需要登录态
-        # if self.headers.get("Cookies") == "" or not self.pong():
-        #     raise Exception(f"[BaiduTieBaClient.pong] Cookies is empty, please login first...")
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_comments_all_sub_comments] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based sub-comment fetching")

        all_sub_comments: List[TiebaComment] = []
+
        for parment_comment in comments:
            if parment_comment.sub_comment_count == 0:
                continue

            current_page = 1
            max_sub_page_num = parment_comment.sub_comment_count // 10 + 1
-            while max_sub_page_num >= current_page:
-                params = {
-                    "tid": parment_comment.note_id,  # 帖子ID
-                    "pid": parment_comment.comment_id,  # 父级评论ID
-                    "fid": parment_comment.tieba_id,  # 贴吧ID
-                    "pn": current_page  # 页码
-                }
-                page_content = await self.get(uri, params=params, return_ori_content=True)
-                sub_comments = self._page_extractor.extract_tieba_note_sub_comments(page_content, parent_comment=parment_comment)

-                if not sub_comments:
+            while max_sub_page_num >= current_page:
+                # 构造子评论URL
+                sub_comment_url = (
+                    f"{self._host}/p/comment?"
+                    f"tid={parment_comment.note_id}&"
+                    f"pid={parment_comment.comment_id}&"
+                    f"fid={parment_comment.tieba_id}&"
+                    f"pn={current_page}"
+                )
+                utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] 访问子评论页面: {sub_comment_url}")
+
+                try:
+                    # 使用Playwright访问子评论页面
+                    await self.playwright_page.goto(sub_comment_url, wait_until="domcontentloaded")
+
+                    # 等待页面加载,使用配置文件中的延时设置
+                    await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+                    # 获取页面HTML内容
+                    page_content = await self.playwright_page.content()
+
+                    # 提取子评论
+                    sub_comments = self._page_extractor.extract_tieba_note_sub_comments(
+                        page_content, parent_comment=parment_comment
+                    )
+
+                    if not sub_comments:
+                        utils.logger.info(
+                            f"[BaiduTieBaClient.get_comments_all_sub_comments] "
+                            f"评论{parment_comment.comment_id}第{current_page}页没有子评论,停止爬取"
+                        )
+                        break
+
+                    if callback:
+                        await callback(parment_comment.note_id, sub_comments)
+
+                    all_sub_comments.extend(sub_comments)
+                    await asyncio.sleep(crawl_interval)
+                    current_page += 1
+
+                except Exception as e:
+                    utils.logger.error(
+                        f"[BaiduTieBaClient.get_comments_all_sub_comments] "
+                        f"获取评论{parment_comment.comment_id}第{current_page}页子评论失败: {e}"
+                    )
                    break
-                if callback:
-                    await callback(parment_comment.note_id, sub_comments)
-                all_sub_comments.extend(sub_comments)
-                await asyncio.sleep(crawl_interval)
-                current_page += 1
+
+        utils.logger.info(f"[BaiduTieBaClient.get_comments_all_sub_comments] 共获取 {len(all_sub_comments)} 条子评论")
        return all_sub_comments

    async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
        """
-        根据贴吧名称获取帖子列表
+        根据贴吧名称获取帖子列表 (使用Playwright访问页面,避免API检测)
        Args:
            tieba_name: 贴吧名称
-            page_num: 分页数量
+            page_num: 分页页码

        Returns:
-
+            List[TiebaNote]: 帖子列表
        """
-        uri = f"/f?kw={tieba_name}&pn={page_num}"
-        page_content = await self.get(uri, return_ori_content=True)
-        return self._page_extractor.extract_tieba_note_list(page_content)
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_notes_by_tieba_name] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based tieba note fetching")
+
+        # 构造贴吧帖子列表URL
+        tieba_url = f"{self._host}/f?kw={quote(tieba_name)}&pn={page_num}"
+        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 访问贴吧页面: {tieba_url}")
+
+        try:
+            # 使用Playwright访问贴吧页面
+            await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
+
+            # 等待页面加载,使用配置文件中的延时设置
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # 获取页面HTML内容
+            page_content = await self.playwright_page.content()
+            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 成功获取贴吧页面HTML,长度: {len(page_content)}")
+
+            # 提取帖子列表
+            notes = self._page_extractor.extract_tieba_note_list(page_content)
+            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] 提取到 {len(notes)} 条帖子")
+            return notes
+
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_tieba_name] 获取贴吧帖子列表失败: {e}")
+            raise

    async def get_creator_info_by_url(self, creator_url: str) -> str:
        """
-        根据创作者ID获取创作者信息
+        根据创作者URL获取创作者信息 (使用Playwright访问页面,避免API检测)
        Args:
            creator_url: 创作者主页URL

        Returns:
-
+            str: 页面HTML内容
        """
-        page_content = await self.request(method="GET", url=creator_url, return_ori_content=True)
-        return page_content
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_creator_info_by_url] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based creator info fetching")
+
+        utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] 访问创作者主页: {creator_url}")
+
+        try:
+            # 使用Playwright访问创作者主页
+            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
+
+            # 等待页面加载,使用配置文件中的延时设置
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # 获取页面HTML内容
+            page_content = await self.playwright_page.content()
+            utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] 成功获取创作者主页HTML,长度: {len(page_content)}")
+
+            return page_content
+
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] 获取创作者主页失败: {e}")
+            raise

    async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
        """
-        根据创作者获取创作者的所有帖子
+        根据创作者获取创作者的帖子 (使用Playwright访问页面,避免API检测)
        Args:
-            user_name:
-            page_number:
+            user_name: 创作者用户名
+            page_number: 页码

        Returns:
-
+            Dict: 包含帖子数据的字典
        """
-        uri = f"/home/get/getthread"
-        params = {
-            "un": user_name,
-            "pn": page_number,
-            "id": "utf-8",
-            "_": utils.get_current_timestamp(),
-        }
-        return await self.get(uri, params=params)
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_notes_by_creator] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based creator notes fetching")
+
+        # 构造创作者帖子列表URL
+        creator_url = f"{self._host}/home/get/getthread?un={quote(user_name)}&pn={page_number}&id=utf-8&_={utils.get_current_timestamp()}"
+        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] 访问创作者帖子列表: {creator_url}")
+
+        try:
+            # 使用Playwright访问创作者帖子列表页面
+            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
+
+            # 等待页面加载,使用配置文件中的延时设置
+            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
+
+            # 获取页面内容(这个接口返回JSON)
+            page_content = await self.playwright_page.content()
+
+            # 提取JSON数据(页面会包含<pre>标签或直接是JSON)
+            try:
+                # 尝试从页面中提取JSON
+                json_text = await self.playwright_page.evaluate("() => document.body.innerText")
+                result = json.loads(json_text)
+                utils.logger.info(f"[BaiduTieBaClient.get_notes_by_creator] 成功获取创作者帖子数据")
+                return result
+            except json.JSONDecodeError as e:
+                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] JSON解析失败: {e}")
+                utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] 页面内容: {page_content[:500]}")
+                raise Exception(f"Failed to parse JSON from creator notes page: {e}")
+
+        except Exception as e:
+            utils.logger.error(f"[BaiduTieBaClient.get_notes_by_creator] 获取创作者帖子列表失败: {e}")
+            raise

    async def get_all_notes_by_creator_user_name(
        self,