mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-21 18:17:26 +08:00
fix: restore Tieba crawling after PC page rewrite
Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors. The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode.

Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction
Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention
Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign
Confidence: high
Scope-risk: moderate
Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests
Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q
Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py
Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false
Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3
Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false
Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser
Not-tested: Full pytest suite has one pre-existing unrelated XHS Excel store assertion failure
This commit is contained in:
@@ -22,6 +22,7 @@ from __future__ import annotations
|
||||
|
||||
|
||||
import sys
|
||||
import re
|
||||
from enum import Enum
|
||||
from types import SimpleNamespace
|
||||
from typing import Iterable, Optional, Sequence, Type, TypeVar
|
||||
@@ -135,6 +136,21 @@ def _inject_init_db_default(args: Sequence[str]) -> list[str]:
|
||||
return normalized
|
||||
|
||||
|
||||
def _normalize_tieba_note_id(value: str) -> str:
|
||||
"""Accept a raw Tieba thread id or a /p/<id> URL."""
|
||||
value = value.strip()
|
||||
match = re.search(r"/p/(\d+)", value)
|
||||
return match.group(1) if match else value
|
||||
|
||||
|
||||
def _normalize_tieba_creator_url(value: str) -> str:
|
||||
"""Accept a Tieba creator homepage URL or a portrait id."""
|
||||
value = value.strip()
|
||||
if value.startswith("http://") or value.startswith("https://"):
|
||||
return value
|
||||
return f"https://tieba.baidu.com/home/main?id={value}"
|
||||
|
||||
|
||||
async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
"""Parse command line arguments using Typer."""
|
||||
|
||||
@@ -344,6 +360,10 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
config.WEIBO_SPECIFIED_ID_LIST = specified_id_list
|
||||
elif platform == PlatformEnum.KUAISHOU:
|
||||
config.KS_SPECIFIED_ID_LIST = specified_id_list
|
||||
elif platform == PlatformEnum.TIEBA:
|
||||
config.TIEBA_SPECIFIED_ID_LIST = [
|
||||
_normalize_tieba_note_id(item) for item in specified_id_list
|
||||
]
|
||||
|
||||
if creator_id_list:
|
||||
if platform == PlatformEnum.XHS:
|
||||
@@ -356,6 +376,10 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
|
||||
config.WEIBO_CREATOR_ID_LIST = creator_id_list
|
||||
elif platform == PlatformEnum.KUAISHOU:
|
||||
config.KS_CREATOR_ID_LIST = creator_id_list
|
||||
elif platform == PlatformEnum.TIEBA:
|
||||
config.TIEBA_CREATOR_URL_LIST = [
|
||||
_normalize_tieba_creator_url(item) for item in creator_id_list
|
||||
]
|
||||
|
||||
return SimpleNamespace(
|
||||
platform=config.PLATFORM,
|
||||
|
||||
Reference in New Issue
Block a user