mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-10 20:47:39 +08:00
Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors. The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode. Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign Confidence: high Scope-risk: moderate Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3 Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser Not-tested: Full pytest suite has one pre-existing unrelated XHS Excel store assertion failure
111 lines
3.4 KiB
Python
111 lines
3.4 KiB
Python
# -*- coding: utf-8 -*-
|
|
|
|
import pytest
|
|
|
|
from media_platform.tieba.client import BaiduTieBaClient
|
|
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
|
|
|
|
|
class DummyPage:
    """Minimal Playwright-page stand-in.

    The client under test only ever reads ``.url`` from the page object,
    so a bare class attribute is enough for these unit tests.
    """

    url = "https://tieba.baidu.com/"
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_search_uses_requested_page_number():
    """The search API call must carry the caller's page number as ``pn``."""
    client = BaiduTieBaClient(playwright_page=DummyPage())
    recorded = []

    async def record_fetch(uri, method="GET", params=None, data=None, use_sign=False):
        # Capture the outgoing request instead of hitting the network,
        # and hand back a minimal well-formed empty search response.
        recorded.append((uri, params))
        return {"no": 0, "data": {"card_list": []}}

    client._fetch_json_by_browser = record_fetch

    await client.get_notes_by_keyword("编程兼职", page=2, page_size=10)

    uri, params = recorded[0]
    assert uri == "/mo/q/search/multsearch"
    assert params["pn"] == 2
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_comments_walk_pages_until_total_reply_page():
    """Comment crawling must visit every page up to ``total_replay_page``."""
    client = BaiduTieBaClient(playwright_page=DummyPage())
    visited_pages = []
    note = TiebaNote(
        note_id="9835114923",
        title="title",
        note_url="https://tieba.baidu.com/p/9835114923",
        tieba_name="加工中心吧",
        tieba_link="https://tieba.baidu.com/f?kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83",
        total_replay_page=2,
    )

    async def stub_page_data(note_id, page=1):
        # Record which page was requested; return a minimal API payload.
        visited_pages.append(page)
        return {"forum": {"id": 1, "name": "加工中心"}, "post_list": []}

    def stub_extract_comments(api_data, note_detail):
        # Produce exactly one synthetic comment per page, keyed by the
        # most recently requested page number.
        current_page = visited_pages[-1]
        comment = TiebaComment(
            comment_id=str(current_page),
            content="comment",
            note_id=note_detail.note_id,
            note_url=note_detail.note_url,
            tieba_id="1",
            tieba_name=note_detail.tieba_name,
            tieba_link=note_detail.tieba_link,
        )
        return [comment]

    client._get_pc_page_data = stub_page_data
    client._page_extractor.extract_tieba_note_parent_comments_from_api = stub_extract_comments

    await client.get_note_all_comments(note, crawl_interval=0, max_count=10)

    assert visited_pages == [1, 2]
|
|
|
|
|
|
@pytest.mark.asyncio
async def test_creator_feed_walks_until_has_more_false(monkeypatch):
    """Creator feed pagination must stop once the API reports no more data."""
    client = BaiduTieBaClient(playwright_page=DummyPage())
    requested_pages = []

    async def stub_creator_feed(portrait, page_number, page_size=20):
        # Page 1 advertises more data; page 2 terminates the walk.
        requested_pages.append(page_number)
        thread_id = str(1000 + page_number)
        return {
            "error_code": 0,
            "data": {
                "has_more": 1 if page_number == 1 else 0,
                "list": [{"thread_info": {"id": thread_id, "tid": thread_id}}],
            },
        }

    async def stub_note_detail(note_id):
        # Hydrate each feed entry into a minimal TiebaNote.
        return TiebaNote(
            note_id=note_id,
            title="title",
            note_url=f"https://tieba.baidu.com/p/{note_id}",
            tieba_name="加工中心吧",
            tieba_link="https://tieba.baidu.com/f?kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83",
        )

    async def no_sleep(_):
        # Keep the test instant regardless of crawl_interval handling.
        return None

    client.get_notes_by_creator_portrait = stub_creator_feed
    client.get_note_by_id = stub_note_detail
    monkeypatch.setattr("media_platform.tieba.client.asyncio.sleep", no_sleep)

    notes = await client.get_all_notes_by_creator_url("tb.1.creator", crawl_interval=0)

    assert requested_pages == [1, 2]
    assert [note.note_id for note in notes] == ["1001", "1002"]
|