Files
MediaCrawler/tests/test_tieba_extractor.py
程序员阿江(Relakkes) f328ee35b5 fix: restore Tieba crawling after PC page rewrite
Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors. The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode.

Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction
Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention
Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign
Confidence: high
Scope-risk: moderate
Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests
Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q
Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py
Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false
Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3
Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false
Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser
Not-tested: Full pytest suite has one pre-existing unrelated XHS Excel store assertion failure
2026-04-30 18:20:46 +08:00

279 lines
9.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# -*- coding: utf-8 -*-
from pathlib import Path
from media_platform.tieba.help import TieBaExtractor
from model.m_baidu_tieba import TiebaComment
FIXTURE_DIR = Path(__file__).parent.parent / "media_platform" / "tieba" / "test_data"
def read_fixture(name: str) -> str:
return (FIXTURE_DIR / name).read_text(encoding="utf-8")
def test_extract_search_note_list_from_keyword_page():
notes = TieBaExtractor.extract_search_note_list(read_fixture("search_keyword_notes.html"))
assert len(notes) == 10
assert notes[0].note_id == "9117888152"
assert notes[0].title.startswith("武汉交互空间科技")
assert notes[0].tieba_name == "武汉交互空间"
assert notes[0].user_nickname == "VR虚拟达人"
def test_extract_search_note_list_from_current_pc_card_page():
page_content = """
<html>
<body>
<div class="threadcardclass thread-new3 index-feed-cards">
<a class="action-link-bg" href="https://tieba.baidu.com/p/10559655942?fr=undefined"></a>
<div class="thread-forum-name display-flex align-center">
<span class="forum-name-text">诸城吧</span>
</div>
<div class="top-title">
<span class="forum-attention user">754023117</span>
<span>发布于 2026-3-15</span>
</div>
<div class="title-wrap"><span>数,英,编程老师</span></div>
<div class="abstract-wrap">
<span>培训班需求,数学,英语,编程老师,专职兼职都可</span>
</div>
<a class="comment-link-zone" href="https://tieba.baidu.com/p/10559655942?showComment=1">
<span class="action-number">19</span>
</a>
</div>
</body>
</html>
"""
notes = TieBaExtractor.extract_search_note_list(page_content)
assert len(notes) == 1
assert notes[0].note_id == "10559655942"
assert notes[0].title == "数,英,编程老师"
assert notes[0].desc == "培训班需求,数学,英语,编程老师,专职兼职都可"
assert notes[0].tieba_name == "诸城吧"
assert notes[0].tieba_link.endswith("kw=%E8%AF%B8%E5%9F%8E")
assert notes[0].user_nickname == "754023117"
assert notes[0].publish_time == "2026-3-15"
assert notes[0].total_replay_num == 19
def test_extract_search_note_list_from_current_pc_api():
api_data = {
"no": 0,
"error": "success",
"data": {
"card_list": [
{"cardInfo": "related_user", "cardStyle": "related_user", "data": {}},
{
"cardInfo": "thread",
"cardStyle": "thread",
"data": {
"tid": "10559655942",
"title": "数,英,编程老师",
"content": "培训班需求,数学,英语,编程老师,专职兼职都可",
"time": 1773552643,
"user": {
"show_nickname": "754023117",
"portrait": "https://example.com/avatar.jpg",
},
"post_num": 19,
"forum_name": "诸城",
},
},
]
},
}
notes = TieBaExtractor().extract_search_note_list_from_api(api_data)
assert len(notes) == 1
assert notes[0].note_id == "10559655942"
assert notes[0].title == "数,英,编程老师"
assert notes[0].tieba_name == "诸城吧"
assert notes[0].total_replay_num == 19
assert notes[0].publish_time
def test_extract_note_detail_and_comments_from_current_pc_api():
api_data = {
"error_code": 0,
"thread": {
"id": 10451142633,
"title": "这X尔斯对比巴尔斯我只能说ID正确允许居功自傲",
"reply_num": 15,
"create_time": 1769951446,
},
"forum": {"id": 1627732, "name": "dota2"},
"page": {"total_page": 1},
"first_floor": {
"id": 153154064746,
"author_id": 4089186644,
"time": 1769951446,
"content": [{"type": 0, "text": "皮队败决处刑德国编程钢琴师兼职数学家"}],
},
"post_list": [
{
"id": 153154097267,
"author_id": 6614897968,
"time": 1769952062,
"content": [{"type": 0, "text": "xg现在大树阵容另一个辅助不选控制"}],
"sub_post_number": 4,
}
],
"user_list": [
{
"id": 4089186644,
"name_show": "泰高祖蒙斯克",
"portrait": "tb.1.f893a7af",
"ip_address": "广东",
},
{
"id": 6614897968,
"name_show": "期胡希3",
"portrait": "tb.1.4d0471d4",
"ip_address": "河北",
},
],
}
extractor = TieBaExtractor()
note = extractor.extract_note_detail_from_api(api_data)
comments = extractor.extract_tieba_note_parent_comments_from_api(api_data, note)
assert note.note_id == "10451142633"
assert note.title == "这X尔斯对比巴尔斯我只能说ID正确允许居功自傲"
assert note.desc == "皮队败决处刑德国编程钢琴师兼职数学家"
assert note.user_nickname == "泰高祖蒙斯克"
assert note.tieba_name == "dota2吧"
assert note.total_replay_num == 15
assert note.total_replay_page == 1
assert note.ip_location == "广东"
assert len(comments) == 1
assert comments[0].comment_id == "153154097267"
assert comments[0].content == "xg现在大树阵容另一个辅助不选控制"
assert comments[0].user_nickname == "期胡希3"
assert comments[0].sub_comment_count == 4
assert comments[0].ip_location == "河北"
def test_extract_creator_info_and_threads_from_current_pc_api():
creator_api = {
"error_code": 0,
"data": {
"user": {
"id": 3546493137,
"name": "拜月教Alice",
"name_show": "米米世界大手子",
"portrait": "tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA?t=1777543466",
"fans_num": 58,
"concern_num": 1,
"sex": 1,
"tb_age": "7.8",
"ip_address": "广东",
}
},
}
feed_api = {
"error_code": 0,
"data": {
"list": [
{"type": 1, "thread_info": {"id": 10208192951, "tid": 10208192951}},
{"type": 1, "thread_info": {"id": 9835114923}},
]
},
}
extractor = TieBaExtractor()
creator = extractor.extract_creator_info_from_api(creator_api)
thread_ids = extractor.extract_creator_thread_id_list_from_api(feed_api)
assert creator.user_id == "3546493137"
assert creator.user_name == "拜月教Alice"
assert creator.nickname == "米米世界大手子"
assert creator.fans == 58
assert creator.follows == 1
assert creator.ip_location == "广东"
assert creator.registration_duration == "7.8"
assert thread_ids == ["10208192951", "9835114923"]
def test_extract_tieba_note_list_from_current_frs_api():
api_data = {
"error_code": 0,
"forum": {
"id": 351091,
"name": "加工中心",
"tids": "10376710029,10636556989,",
},
}
notes = TieBaExtractor().extract_tieba_note_list_from_frs_api(api_data)
assert [note.note_id for note in notes] == ["10376710029", "10636556989"]
assert notes[0].note_url == "https://tieba.baidu.com/p/10376710029"
assert notes[0].tieba_name == "加工中心吧"
assert notes[0].tieba_link.endswith("kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83")
def test_extract_tieba_note_list_from_bigpipe_thread_page():
notes = TieBaExtractor().extract_tieba_note_list(read_fixture("tieba_note_list.html"))
assert len(notes) == 48
assert notes[0].note_id == "9079949995"
assert notes[0].title == "盗墓笔记全集+txt小说已整理"
assert notes[0].user_nickname == "公子伯仲"
assert notes[0].tieba_name == "盗墓笔记吧"
assert notes[0].tieba_link.endswith("kw=%E7%9B%97%E5%A2%93%E7%AC%94%E8%AE%B0&ie=utf-8")
def test_extract_note_detail_from_post_page():
note = TieBaExtractor().extract_note_detail(read_fixture("note_detail.html"))
assert note.note_id == "9117905169"
assert note.title == "对于一个父亲来说这个女儿14岁就死了"
assert note.user_nickname == "章景轩"
assert note.tieba_name == "以太比特吧"
assert note.total_replay_num == 786
assert note.total_replay_page == 13
assert note.ip_location == "广东"
def test_extract_parent_comments_from_post_page():
comments = TieBaExtractor().extract_tieba_note_parment_comments(
read_fixture("note_comments.html"),
"9119688421",
)
assert len(comments) == 30
assert comments[0].comment_id == "150726491368"
assert comments[0].content == "中国队第22金无悬念"
assert comments[0].user_nickname == "heinzfrentzen"
assert comments[0].tieba_name == "网球风云吧"
assert comments[0].ip_location == "福建"
def test_extract_sub_comments_with_class_token_matching():
parent = TiebaComment(
comment_id="150726496253",
content="parent",
note_id="9119688421",
note_url="https://tieba.baidu.com/p/9119688421",
tieba_id="4513750",
tieba_name="网球风云吧",
tieba_link="https://tieba.baidu.com/f?kw=%E7%BD%91%E7%90%83%E9%A3%8E%E4%BA%91",
)
comments = TieBaExtractor().extract_tieba_note_sub_comments(
read_fixture("note_sub_comments.html"),
parent,
)
assert len(comments) >= 10
assert comments[0].comment_id
assert comments[0].parent_comment_id == parent.comment_id
assert comments[0].user_link.startswith("https://tieba.baidu.com/home/main")