mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-21 18:17:26 +08:00
fix: restore Tieba crawling after PC page rewrite
Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors.
The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode.

Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction
Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention
Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign
Confidence: high
Scope-risk: moderate
Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests
Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q
Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py
Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false
Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3
Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false
Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser
Not-tested: Full pytest suite has one pre-existing unrelated XHS Excel store assertion failure
This commit is contained in:
62
tests/test_cmd_arg_tieba.py
Normal file
62
tests/test_cmd_arg_tieba.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import config
|
||||
import pytest
|
||||
from cmd_arg import parse_cmd
|
||||
from media_platform.tieba import TieBaCrawler
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tieba_detail_cli_sets_specified_ids():
    """Check that ``--specified_id`` populates ``config.TIEBA_SPECIFIED_ID_LIST``.

    The comma-separated value mixes a full post URL with a bare post id; the
    resulting config list is expected to hold both entries as bare ids.
    """
    cli_args = ["--platform", "tieba", "--type", "detail"]
    cli_args += [
        "--specified_id",
        "https://tieba.baidu.com/p/10451142633,9835114923",
    ]

    await parse_cmd(cli_args)

    expected_ids = ["10451142633", "9835114923"]
    assert config.TIEBA_SPECIFIED_ID_LIST == expected_ids
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tieba_creator_cli_sets_creator_urls():
    """Check that ``--creator_id`` populates ``config.TIEBA_CREATOR_URL_LIST``.

    The comma-separated value mixes a bare creator id with a full home-page
    URL; the resulting config list is expected to hold a home-page URL for
    each entry.
    """
    cli_args = ["--platform", "tieba", "--type", "creator"]
    cli_args += [
        "--creator_id",
        "tb.1.example,https://tieba.baidu.com/home/main?id=tb.1.raw",
    ]

    await parse_cmd(cli_args)

    expected_urls = [
        "https://tieba.baidu.com/home/main?id=tb.1.example",
        "https://tieba.baidu.com/home/main?id=tb.1.raw",
    ]
    assert config.TIEBA_CREATOR_URL_LIST == expected_urls
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tieba_detail_reads_runtime_specified_ids(monkeypatch):
    """Check that ``get_specified_notes`` uses the id list patched in at runtime.

    The crawler is built first, then ``config.TIEBA_SPECIFIED_ID_LIST`` is
    patched; the detail-fetching stub must still be called with the patched id.
    """
    crawler = TieBaCrawler()
    requested_ids = []

    async def record_note_detail(note_id, semaphore):
        # Stand-in for the detail fetch: only remember which id was asked for.
        requested_ids.append(note_id)

    async def ignore_comments(note_details):
        # Stand-in for comment crawling: do nothing.
        return None

    monkeypatch.setattr(config, "TIEBA_SPECIFIED_ID_LIST", ["10451142633"])
    crawler_stubs = (
        ("get_note_detail_async_task", record_note_detail),
        ("batch_get_note_comments", ignore_comments),
    )
    for attr_name, stub in crawler_stubs:
        monkeypatch.setattr(crawler, attr_name, stub)

    await crawler.get_specified_notes()

    assert requested_ids == ["10451142633"]
|
||||
Reference in New Issue
Block a user