fix: restore Tieba crawling after PC page rewrite

Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors. The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode.

Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction
Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention
Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign
Confidence: high
Scope-risk: moderate
Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests
Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q
Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py
Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false
Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3
Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false
Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser
Not-tested: A fully green run of the whole pytest suite; it still has one pre-existing XHS Excel store assertion failure that is unrelated to this change
This commit is contained in:
程序员阿江(Relakkes)
2026-04-30 18:20:46 +08:00
parent 1572b64334
commit f328ee35b5
7 changed files with 1308 additions and 176 deletions

View File

@@ -213,7 +213,7 @@ class TieBaCrawler(AbstractCrawler):
Returns:
"""
tieba_limit_count = 50
tieba_limit_count = 30
if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
for tieba_name in config.TIEBA_NAME_LIST:
@@ -245,7 +245,7 @@ class TieBaCrawler(AbstractCrawler):
page_number += tieba_limit_count
async def get_specified_notes(
self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST
self, note_id_list: Optional[List[str]] = None
):
"""
Get the information and comments of the specified post
@@ -255,6 +255,8 @@ class TieBaCrawler(AbstractCrawler):
Returns:
"""
if note_id_list is None:
note_id_list = config.TIEBA_SPECIFIED_ID_LIST
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore)
@@ -365,18 +367,15 @@ class TieBaCrawler(AbstractCrawler):
"""
utils.logger.info(
"[WeiboCrawler.get_creators_and_notes] Begin get weibo creators"
"[TieBaCrawler.get_creators_and_notes] Begin get tieba creators"
)
for creator_url in config.TIEBA_CREATOR_URL_LIST:
creator_page_html_content = await self.tieba_client.get_creator_info_by_url(
creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(
creator_url=creator_url
)
creator_info: TiebaCreator = self._page_extractor.extract_creator_info(
creator_page_html_content
)
if creator_info:
utils.logger.info(
f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}"
f"[TieBaCrawler.get_creators_and_notes] creator info: {creator_info}"
)
if not creator_info:
raise Exception("Get creator info error")
@@ -385,12 +384,11 @@ class TieBaCrawler(AbstractCrawler):
# Get all note information of the creator
all_notes_list = (
await self.tieba_client.get_all_notes_by_creator_user_name(
user_name=creator_info.user_name,
await self.tieba_client.get_all_notes_by_creator_url(
creator_url=creator_url,
crawl_interval=0,
callback=tieba_store.batch_update_tieba_notes,
max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
creator_page_html_content=creator_page_html_content,
)
)
@@ -398,7 +396,7 @@ class TieBaCrawler(AbstractCrawler):
else:
utils.logger.error(
f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
f"[TieBaCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
)
async def _navigate_to_tieba_via_baidu(self):