fix: restore Tieba crawling after PC page rewrite

Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors. The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode.

Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction
Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention
Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign
Confidence: high
Scope-risk: moderate
Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests
Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q
Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py
Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false
Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3
Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false
Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser
Not-tested: Full pytest suite has one pre-existing unrelated XHS Excel store assertion failure
2026-05-27 21:17:27 +08:00 · 2026-04-30 18:20:46 +08:00
parent 1572b64334
commit f328ee35b5
7 changed files with 1308 additions and 176 deletions
--- a/media_platform/tieba/core.py
+++ b/media_platform/tieba/core.py
@@ -213,7 +213,7 @@ class TieBaCrawler(AbstractCrawler):
        Returns:

        """
-        tieba_limit_count = 50
+        tieba_limit_count = 30
        if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
            config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
        for tieba_name in config.TIEBA_NAME_LIST:
@@ -245,7 +245,7 @@ class TieBaCrawler(AbstractCrawler):
                page_number += tieba_limit_count

    async def get_specified_notes(
-        self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST
+        self, note_id_list: Optional[List[str]] = None
    ):
        """
        Get the information and comments of the specified post
@@ -255,6 +255,8 @@ class TieBaCrawler(AbstractCrawler):
        Returns:

        """
+        if note_id_list is None:
+            note_id_list = config.TIEBA_SPECIFIED_ID_LIST
        semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
        task_list = [
            self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore)
@@ -365,18 +367,15 @@ class TieBaCrawler(AbstractCrawler):

        """
        utils.logger.info(
-            "[WeiboCrawler.get_creators_and_notes] Begin get weibo creators"
+            "[TieBaCrawler.get_creators_and_notes] Begin get tieba creators"
        )
        for creator_url in config.TIEBA_CREATOR_URL_LIST:
-            creator_page_html_content = await self.tieba_client.get_creator_info_by_url(
+            creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(
                creator_url=creator_url
            )
-            creator_info: TiebaCreator = self._page_extractor.extract_creator_info(
-                creator_page_html_content
-            )
            if creator_info:
                utils.logger.info(
-                    f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}"
+                    f"[TieBaCrawler.get_creators_and_notes] creator info: {creator_info}"
                )
                if not creator_info:
                    raise Exception("Get creator info error")
@@ -385,12 +384,11 @@ class TieBaCrawler(AbstractCrawler):

                # Get all note information of the creator
                all_notes_list = (
-                    await self.tieba_client.get_all_notes_by_creator_user_name(
-                        user_name=creator_info.user_name,
+                    await self.tieba_client.get_all_notes_by_creator_url(
+                        creator_url=creator_url,
                        crawl_interval=0,
                        callback=tieba_store.batch_update_tieba_notes,
                        max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
-                        creator_page_html_content=creator_page_html_content,
                    )
                )

@@ -398,7 +396,7 @@ class TieBaCrawler(AbstractCrawler):

            else:
                utils.logger.error(
-                    f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
+                    f"[TieBaCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
                )

    async def _navigate_to_tieba_via_baidu(self):