mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-11 21:17:38 +08:00
fix: restore Tieba crawling after PC page rewrite
Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors. The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode.

Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction
Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention
Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign
Confidence: high
Scope-risk: moderate
Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests
Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q
Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py
Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false
Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3
Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false
Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser
Not-tested: Full pytest suite has one pre-existing unrelated XHS Excel store assertion failure
This commit is contained in:
62
tests/test_cmd_arg_tieba.py
Normal file
62
tests/test_cmd_arg_tieba.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import config
|
||||
import pytest
|
||||
from cmd_arg import parse_cmd
|
||||
from media_platform.tieba import TieBaCrawler
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tieba_detail_cli_sets_specified_ids(monkeypatch):
    """``--specified_id`` values end up in ``config.TIEBA_SPECIFIED_ID_LIST``.

    The argument mixes a full post URL with a bare numeric id; the test
    expects both back as id strings, in the order they were given.
    """
    # parse_cmd writes into the shared ``config`` module. Re-setting the
    # attribute through monkeypatch records its current value, so pytest
    # restores it at teardown and later tests do not inherit these ids.
    # NOTE(review): parse_cmd may also write other config attributes
    # (platform, crawler type, ...); only the one asserted here is restored.
    monkeypatch.setattr(
        config, "TIEBA_SPECIFIED_ID_LIST", config.TIEBA_SPECIFIED_ID_LIST
    )

    await parse_cmd(
        [
            "--platform",
            "tieba",
            "--type",
            "detail",
            "--specified_id",
            "https://tieba.baidu.com/p/10451142633,9835114923",
        ]
    )

    assert config.TIEBA_SPECIFIED_ID_LIST == ["10451142633", "9835114923"]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tieba_creator_cli_sets_creator_urls(monkeypatch):
    """``--creator_id`` values end up in ``config.TIEBA_CREATOR_URL_LIST``.

    The argument mixes a bare creator id with a full home-page URL; the test
    expects both back as home-page URLs, in the order they were given.
    """
    # parse_cmd writes into the shared ``config`` module. Re-setting the
    # attribute through monkeypatch records its current value, so pytest
    # restores it at teardown and later tests do not inherit these URLs.
    # NOTE(review): parse_cmd may also write other config attributes
    # (platform, crawler type, ...); only the one asserted here is restored.
    monkeypatch.setattr(
        config, "TIEBA_CREATOR_URL_LIST", config.TIEBA_CREATOR_URL_LIST
    )

    await parse_cmd(
        [
            "--platform",
            "tieba",
            "--type",
            "creator",
            "--creator_id",
            "tb.1.example,https://tieba.baidu.com/home/main?id=tb.1.raw",
        ]
    )

    assert config.TIEBA_CREATOR_URL_LIST == [
        "https://tieba.baidu.com/home/main?id=tb.1.example",
        "https://tieba.baidu.com/home/main?id=tb.1.raw",
    ]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_tieba_detail_reads_runtime_specified_ids(monkeypatch):
    """``get_specified_notes`` uses the id list present in ``config`` at call time.

    The id list is patched after the crawler has been constructed, and the
    per-note detail task is replaced by a recorder so no real request is made.
    """
    tieba_crawler = TieBaCrawler()
    requested_ids = []

    async def record_note_detail(note_id, semaphore):
        # Remember which note the crawler asked for; report "no detail".
        requested_ids.append(note_id)

    async def skip_comments(note_details):
        # Comment crawling is irrelevant to this test.
        return None

    replacements = (
        (config, "TIEBA_SPECIFIED_ID_LIST", ["10451142633"]),
        (tieba_crawler, "get_note_detail_async_task", record_note_detail),
        (tieba_crawler, "batch_get_note_comments", skip_comments),
    )
    for target, attribute, value in replacements:
        monkeypatch.setattr(target, attribute, value)

    await tieba_crawler.get_specified_notes()

    assert requested_ids == ["10451142633"]
|
||||
110
tests/test_tieba_client_pagination.py
Normal file
110
tests/test_tieba_client_pagination.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
import pytest
|
||||
|
||||
from media_platform.tieba.client import BaiduTieBaClient
|
||||
from model.m_baidu_tieba import TiebaComment, TiebaNote
|
||||
|
||||
|
||||
class DummyPage:
    """Minimal stand-in passed as ``playwright_page`` when building a client.

    Only the ``url`` attribute is provided.
    """

    # Tieba home page address exposed as the page's current URL.
    url = "https://tieba.baidu.com/"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_search_uses_requested_page_number():
    """Keyword search forwards the requested page as the ``pn`` parameter.

    The browser fetch helper is replaced by a recorder that returns an empty
    card list, so only the outgoing URI and parameters are checked.
    """
    recorded_requests = []

    async def record_fetch(uri, method="GET", params=None, data=None, use_sign=False):
        recorded_requests.append((uri, params))
        return {"no": 0, "data": {"card_list": []}}

    tieba_client = BaiduTieBaClient(playwright_page=DummyPage())
    tieba_client._fetch_json_by_browser = record_fetch

    await tieba_client.get_notes_by_keyword("编程兼职", page=2, page_size=10)

    first_uri, first_params = recorded_requests[0]
    assert first_uri == "/mo/q/search/multsearch"
    assert first_params["pn"] == 2
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_comments_walk_pages_until_total_reply_page():
    """Comment crawling requests every page up to the note's ``total_replay_page``.

    The note declares two reply pages; the page fetcher is replaced by a
    recorder and the extractor by a stub yielding one comment per page.
    """
    tieba_client = BaiduTieBaClient(playwright_page=DummyPage())
    requested_pages = []
    two_page_note = TiebaNote(
        note_id="9835114923",
        title="title",
        note_url="https://tieba.baidu.com/p/9835114923",
        tieba_name="加工中心吧",
        tieba_link="https://tieba.baidu.com/f?kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83",
        total_replay_page=2,
    )

    async def record_page_request(note_id, page=1):
        requested_pages.append(page)
        return {"forum": {"id": 1, "name": "加工中心"}, "post_list": []}

    def one_comment_per_page(api_data, note_detail):
        # The comment id mirrors the most recently requested page number.
        latest_page = requested_pages[-1]
        comment = TiebaComment(
            comment_id=str(latest_page),
            content="comment",
            note_id=note_detail.note_id,
            note_url=note_detail.note_url,
            tieba_id="1",
            tieba_name=note_detail.tieba_name,
            tieba_link=note_detail.tieba_link,
        )
        return [comment]

    tieba_client._get_pc_page_data = record_page_request
    tieba_client._page_extractor.extract_tieba_note_parent_comments_from_api = one_comment_per_page

    await tieba_client.get_note_all_comments(two_page_note, crawl_interval=0, max_count=10)

    assert requested_pages == [1, 2]
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_creator_feed_walks_until_has_more_false(monkeypatch):
    """Creator crawling keeps paging the feed until ``has_more`` is 0.

    The stubbed feed reports ``has_more`` only on page 1 and returns one
    thread per page; note lookup and ``asyncio.sleep`` are stubbed as well.
    """
    tieba_client = BaiduTieBaClient(playwright_page=DummyPage())
    requested_pages = []

    async def paged_creator_feed(portrait, page_number, page_size=20):
        requested_pages.append(page_number)
        thread_id = str(1000 + page_number)
        more_pages = 1 if page_number == 1 else 0
        feed_entry = {"thread_info": {"id": thread_id, "tid": thread_id}}
        return {
            "error_code": 0,
            "data": {"has_more": more_pages, "list": [feed_entry]},
        }

    async def stub_note_by_id(note_id):
        return TiebaNote(
            note_id=note_id,
            title="title",
            note_url=f"https://tieba.baidu.com/p/{note_id}",
            tieba_name="加工中心吧",
            tieba_link="https://tieba.baidu.com/f?kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83",
        )

    async def instant_sleep(_):
        # Skip real waiting between pages.
        return None

    tieba_client.get_notes_by_creator_portrait = paged_creator_feed
    tieba_client.get_note_by_id = stub_note_by_id
    monkeypatch.setattr("media_platform.tieba.client.asyncio.sleep", instant_sleep)

    notes = await tieba_client.get_all_notes_by_creator_url("tb.1.creator", crawl_interval=0)

    assert requested_pages == [1, 2]
    collected_ids = [collected.note_id for collected in notes]
    assert collected_ids == ["1001", "1002"]
|
||||
278
tests/test_tieba_extractor.py
Normal file
278
tests/test_tieba_extractor.py
Normal file
@@ -0,0 +1,278 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from media_platform.tieba.help import TieBaExtractor
|
||||
from model.m_baidu_tieba import TiebaComment
|
||||
|
||||
|
||||
# Directory of saved fixture files read by read_fixture(); resolved relative
# to this test file as <parent of this file's directory>/media_platform/tieba/test_data.
FIXTURE_DIR = Path(__file__).parent.parent / "media_platform" / "tieba" / "test_data"
|
||||
|
||||
|
||||
def read_fixture(name: str) -> str:
    """Return the UTF-8 text of the fixture file *name* under ``FIXTURE_DIR``."""
    fixture_path = FIXTURE_DIR / name
    return fixture_path.read_text(encoding="utf-8")
|
||||
|
||||
|
||||
def test_extract_search_note_list_from_keyword_page():
    """The saved keyword-search HTML page yields ten notes with the expected first entry."""
    page_html = read_fixture("search_keyword_notes.html")
    notes = TieBaExtractor.extract_search_note_list(page_html)

    assert len(notes) == 10

    first_note = notes[0]
    assert first_note.note_id == "9117888152"
    assert first_note.title.startswith("武汉交互空间科技")
    assert first_note.tieba_name == "武汉交互空间"
    assert first_note.user_nickname == "VR虚拟达人"
|
||||
|
||||
|
||||
def test_extract_search_note_list_from_current_pc_card_page():
    """A single thread card in the current PC search markup is parsed into one note.

    The inline HTML carries the thread link, forum name, author, publish date,
    title, abstract and reply count that the assertions read back.
    """
    page_content = """
    <html>
      <body>
        <div class="threadcardclass thread-new3 index-feed-cards">
          <a class="action-link-bg" href="https://tieba.baidu.com/p/10559655942?fr=undefined"></a>
          <div class="thread-forum-name display-flex align-center">
            <span class="forum-name-text">诸城吧</span>
          </div>
          <div class="top-title">
            <span class="forum-attention user">754023117</span>
            <span>发布于 2026-3-15</span>
          </div>
          <div class="title-wrap"><span>数,英,编程老师</span></div>
          <div class="abstract-wrap">
            <span>培训班需求,数学,英语,编程老师,专职兼职都可</span>
          </div>
          <a class="comment-link-zone" href="https://tieba.baidu.com/p/10559655942?showComment=1">
            <span class="action-number">19</span>
          </a>
        </div>
      </body>
    </html>
    """

    notes = TieBaExtractor.extract_search_note_list(page_content)

    assert len(notes) == 1

    card_note = notes[0]
    assert card_note.note_id == "10559655942"
    assert card_note.title == "数,英,编程老师"
    assert card_note.desc == "培训班需求,数学,英语,编程老师,专职兼职都可"
    assert card_note.tieba_name == "诸城吧"
    assert card_note.tieba_link.endswith("kw=%E8%AF%B8%E5%9F%8E")
    assert card_note.user_nickname == "754023117"
    assert card_note.publish_time == "2026-3-15"
    assert card_note.total_replay_num == 19
|
||||
|
||||
|
||||
def test_extract_search_note_list_from_current_pc_api():
    """Only the ``thread`` card of a search API payload becomes a note.

    The payload also contains a ``related_user`` card; exactly one note is
    expected, built from the thread card's data.
    """
    related_user_card = {"cardInfo": "related_user", "cardStyle": "related_user", "data": {}}
    thread_card = {
        "cardInfo": "thread",
        "cardStyle": "thread",
        "data": {
            "tid": "10559655942",
            "title": "数,英,编程老师",
            "content": "培训班需求,数学,英语,编程老师,专职兼职都可",
            "time": 1773552643,
            "user": {
                "show_nickname": "754023117",
                "portrait": "https://example.com/avatar.jpg",
            },
            "post_num": 19,
            "forum_name": "诸城",
        },
    }
    api_data = {
        "no": 0,
        "error": "success",
        "data": {"card_list": [related_user_card, thread_card]},
    }

    notes = TieBaExtractor().extract_search_note_list_from_api(api_data)

    assert len(notes) == 1

    api_note = notes[0]
    assert api_note.note_id == "10559655942"
    assert api_note.title == "数,英,编程老师"
    assert api_note.tieba_name == "诸城吧"
    assert api_note.total_replay_num == 19
    assert api_note.publish_time
|
||||
|
||||
|
||||
def test_extract_note_detail_and_comments_from_current_pc_api():
    """A thread API payload yields the note detail and its first-level comments.

    The payload carries the thread, forum, first floor, one reply and the two
    users referenced by ``author_id``; author fields on the note and on the
    comment are expected to come from the matching ``user_list`` entries.
    """
    thread_author = {
        "id": 4089186644,
        "name_show": "泰高祖蒙斯克",
        "portrait": "tb.1.f893a7af",
        "ip_address": "广东",
    }
    reply_author = {
        "id": 6614897968,
        "name_show": "期胡希3",
        "portrait": "tb.1.4d0471d4",
        "ip_address": "河北",
    }
    first_floor = {
        "id": 153154064746,
        "author_id": 4089186644,
        "time": 1769951446,
        "content": [{"type": 0, "text": "皮队败决处刑德国编程钢琴师兼职数学家"}],
    }
    reply_post = {
        "id": 153154097267,
        "author_id": 6614897968,
        "time": 1769952062,
        "content": [{"type": 0, "text": "xg现在大树阵容另一个辅助不选控制"}],
        "sub_post_number": 4,
    }
    api_data = {
        "error_code": 0,
        "thread": {
            "id": 10451142633,
            "title": "这X尔斯对比巴尔斯,我只能说ID正确,允许居功自傲",
            "reply_num": 15,
            "create_time": 1769951446,
        },
        "forum": {"id": 1627732, "name": "dota2"},
        "page": {"total_page": 1},
        "first_floor": first_floor,
        "post_list": [reply_post],
        "user_list": [thread_author, reply_author],
    }

    extractor = TieBaExtractor()
    note = extractor.extract_note_detail_from_api(api_data)
    comments = extractor.extract_tieba_note_parent_comments_from_api(api_data, note)

    # Note-level fields.
    assert note.note_id == "10451142633"
    assert note.title == "这X尔斯对比巴尔斯,我只能说ID正确,允许居功自傲"
    assert note.desc == "皮队败决处刑德国编程钢琴师兼职数学家"
    assert note.user_nickname == "泰高祖蒙斯克"
    assert note.tieba_name == "dota2吧"
    assert note.total_replay_num == 15
    assert note.total_replay_page == 1
    assert note.ip_location == "广东"

    # First-level comment fields.
    assert len(comments) == 1
    only_comment = comments[0]
    assert only_comment.comment_id == "153154097267"
    assert only_comment.content == "xg现在大树阵容另一个辅助不选控制"
    assert only_comment.user_nickname == "期胡希3"
    assert only_comment.sub_comment_count == 4
    assert only_comment.ip_location == "河北"
|
||||
|
||||
|
||||
def test_extract_creator_info_and_threads_from_current_pc_api():
    """Creator profile and feed API payloads map to creator fields and thread ids.

    The second feed entry has only ``id`` (no ``tid``) and is still expected
    in the resulting id list.
    """
    user_payload = {
        "id": 3546493137,
        "name": "拜月教Alice",
        "name_show": "米米世界大手子",
        "portrait": "tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA?t=1777543466",
        "fans_num": 58,
        "concern_num": 1,
        "sex": 1,
        "tb_age": "7.8",
        "ip_address": "广东",
    }
    creator_api = {"error_code": 0, "data": {"user": user_payload}}
    feed_entries = [
        {"type": 1, "thread_info": {"id": 10208192951, "tid": 10208192951}},
        {"type": 1, "thread_info": {"id": 9835114923}},
    ]
    feed_api = {"error_code": 0, "data": {"list": feed_entries}}

    extractor = TieBaExtractor()
    creator = extractor.extract_creator_info_from_api(creator_api)
    thread_ids = extractor.extract_creator_thread_id_list_from_api(feed_api)

    assert creator.user_id == "3546493137"
    assert creator.user_name == "拜月教Alice"
    assert creator.nickname == "米米世界大手子"
    assert creator.fans == 58
    assert creator.follows == 1
    assert creator.ip_location == "广东"
    assert creator.registration_duration == "7.8"
    assert thread_ids == ["10208192951", "9835114923"]
|
||||
|
||||
|
||||
def test_extract_tieba_note_list_from_current_frs_api():
    """The forum API's comma-separated ``tids`` string becomes one note per id.

    The input string ends with a trailing comma; exactly two notes are expected.
    """
    forum_payload = {
        "id": 351091,
        "name": "加工中心",
        "tids": "10376710029,10636556989,",
    }
    api_data = {"error_code": 0, "forum": forum_payload}

    notes = TieBaExtractor().extract_tieba_note_list_from_frs_api(api_data)

    note_ids = [listed.note_id for listed in notes]
    assert note_ids == ["10376710029", "10636556989"]

    first_note = notes[0]
    assert first_note.note_url == "https://tieba.baidu.com/p/10376710029"
    assert first_note.tieba_name == "加工中心吧"
    assert first_note.tieba_link.endswith("kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83")
|
||||
|
||||
|
||||
def test_extract_tieba_note_list_from_bigpipe_thread_page():
    """The saved forum thread-list HTML page yields 48 notes with the expected first entry."""
    page_html = read_fixture("tieba_note_list.html")
    notes = TieBaExtractor().extract_tieba_note_list(page_html)

    assert len(notes) == 48

    first_note = notes[0]
    assert first_note.note_id == "9079949995"
    assert first_note.title == "盗墓笔记全集+txt小说,已整理"
    assert first_note.user_nickname == "公子伯仲"
    assert first_note.tieba_name == "盗墓笔记吧"
    assert first_note.tieba_link.endswith("kw=%E7%9B%97%E5%A2%93%E7%AC%94%E8%AE%B0&ie=utf-8")
|
||||
|
||||
|
||||
def test_extract_note_detail_from_post_page():
    """The saved post-detail HTML page is parsed into the expected note fields."""
    page_html = read_fixture("note_detail.html")
    detail = TieBaExtractor().extract_note_detail(page_html)

    assert detail.note_id == "9117905169"
    assert detail.title == "对于一个父亲来说,这个女儿14岁就死了"
    assert detail.user_nickname == "章景轩"
    assert detail.tieba_name == "以太比特吧"
    assert detail.total_replay_num == 786
    assert detail.total_replay_page == 13
    assert detail.ip_location == "广东"
|
||||
|
||||
|
||||
def test_extract_parent_comments_from_post_page():
    """The saved comments HTML page yields 30 first-level comments with the expected first entry."""
    page_html = read_fixture("note_comments.html")
    comments = TieBaExtractor().extract_tieba_note_parment_comments(page_html, "9119688421")

    assert len(comments) == 30

    first_comment = comments[0]
    assert first_comment.comment_id == "150726491368"
    assert first_comment.content == "中国队第22金!无悬念!"
    assert first_comment.user_nickname == "heinzfrentzen"
    assert first_comment.tieba_name == "网球风云吧"
    assert first_comment.ip_location == "福建"
|
||||
|
||||
|
||||
def test_extract_sub_comments_with_class_token_matching():
    """The saved sub-comment HTML page yields at least ten replies tied to the parent.

    Each extracted reply is expected to carry the parent's comment id, and the
    first one a user link under the Tieba home-page path.
    """
    parent_comment = TiebaComment(
        comment_id="150726496253",
        content="parent",
        note_id="9119688421",
        note_url="https://tieba.baidu.com/p/9119688421",
        tieba_id="4513750",
        tieba_name="网球风云吧",
        tieba_link="https://tieba.baidu.com/f?kw=%E7%BD%91%E7%90%83%E9%A3%8E%E4%BA%91",
    )
    page_html = read_fixture("note_sub_comments.html")

    sub_comments = TieBaExtractor().extract_tieba_note_sub_comments(page_html, parent_comment)

    assert len(sub_comments) >= 10

    first_reply = sub_comments[0]
    assert first_reply.comment_id
    assert first_reply.parent_comment_id == parent_comment.comment_id
    assert first_reply.user_link.startswith("https://tieba.baidu.com/home/main")
|
||||
Reference in New Issue
Block a user