fix: restore Tieba crawling after PC page rewrite

Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors. The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode.

Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction
Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention
Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign
Confidence: high
Scope-risk: moderate
Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests
Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q
Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py
Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false
Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3
Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false
Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser
Not-tested: Full pytest suite has one pre-existing unrelated XHS Excel store assertion failure
程序员阿江(Relakkes)
2026-04-30 18:20:46 +08:00
parent 1572b64334
commit f328ee35b5
7 changed files with 1308 additions and 176 deletions
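For reference while reading the client diff: the web-client signing convention named in the constraints above is a sorted key=value concatenation plus a shared secret, MD5-hexed. A minimal standalone sketch mirroring the new _sign_pc_params (the secret and the subapp_type/_client_type defaults are copied from the diff; the kz value is the thread id from the Tested lines):

import hashlib

def sign_pc_params(params: dict, secret: str) -> str:
    # Concatenate key=value in sorted key order, skipping sign/sig and None
    # values, append the secret, then take the hex MD5 digest.
    sign_text = "".join(
        f"{key}={params[key]}"
        for key in sorted(params)
        if key not in {"sign", "sig"} and params[key] is not None
    )
    return hashlib.md5((sign_text + secret).encode("utf-8")).hexdigest()

params = {"subapp_type": "pc", "_client_type": "20", "kz": "9835114923"}
params["sign"] = sign_pc_params(params, "36770b1f34c9bbf2e7d1a99d2b82fa9e")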

View File

@@ -22,6 +22,7 @@ from __future__ import annotations
import sys
import re
from enum import Enum
from types import SimpleNamespace
from typing import Iterable, Optional, Sequence, Type, TypeVar
@@ -135,6 +136,21 @@ def _inject_init_db_default(args: Sequence[str]) -> list[str]:
return normalized
def _normalize_tieba_note_id(value: str) -> str:
"""Accept a raw Tieba thread id or a /p/<id> URL."""
value = value.strip()
match = re.search(r"/p/(\d+)", value)
return match.group(1) if match else value
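Expected normalization, using the ids exercised by the new CLI test below (a doctest-style check, not part of the diff):

# Both a bare thread id and a /p/<id> URL normalize to the bare id.
assert _normalize_tieba_note_id("https://tieba.baidu.com/p/10451142633") == "10451142633"
assert _normalize_tieba_note_id("9835114923") == "9835114923"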
def _normalize_tieba_creator_url(value: str) -> str:
"""Accept a Tieba creator homepage URL or a portrait id."""
value = value.strip()
if value.startswith("http://") or value.startswith("https://"):
return value
return f"https://tieba.baidu.com/home/main?id={value}"
async def parse_cmd(argv: Optional[Sequence[str]] = None):
"""Parse command line arguments using Typer."""
@@ -344,6 +360,10 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
config.WEIBO_SPECIFIED_ID_LIST = specified_id_list
elif platform == PlatformEnum.KUAISHOU:
config.KS_SPECIFIED_ID_LIST = specified_id_list
elif platform == PlatformEnum.TIEBA:
config.TIEBA_SPECIFIED_ID_LIST = [
_normalize_tieba_note_id(item) for item in specified_id_list
]
if creator_id_list:
if platform == PlatformEnum.XHS:
@@ -356,6 +376,10 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
config.WEIBO_CREATOR_ID_LIST = creator_id_list
elif platform == PlatformEnum.KUAISHOU:
config.KS_CREATOR_ID_LIST = creator_id_list
elif platform == PlatformEnum.TIEBA:
config.TIEBA_CREATOR_URL_LIST = [
_normalize_tieba_creator_url(item) for item in creator_id_list
]
return SimpleNamespace(
platform=config.PLATFORM,

View File

@@ -18,9 +18,10 @@
# Use of this code constitutes agreement to the above principles and all terms of the LICENSE.
import asyncio
import hashlib
import json
from typing import Any, Callable, Dict, List, Optional, Union
from urllib.parse import urlencode, quote
from urllib.parse import urlencode, quote, parse_qs, unquote, urlparse
import requests
from playwright.async_api import BrowserContext, Page
@@ -35,6 +36,8 @@ from tools import utils
from .field import SearchNoteType, SearchSortType
from .help import TieBaExtractor
PC_SIGN_SECRET = "36770b1f34c9bbf2e7d1a99d2b82fa9e"
class BaiduTieBaClient(AbstractApiClient):
@@ -58,6 +61,128 @@ class BaiduTieBaClient(AbstractApiClient):
self._page_extractor = TieBaExtractor()
self.default_ip_proxy = default_ip_proxy
self.playwright_page = playwright_page # Playwright page object
self._pc_tbs = ""
@staticmethod
def _sign_pc_params(params: Dict[str, Any]) -> str:
sign_text = ""
for key in sorted(params):
if key in {"sign", "sig"} or params[key] is None:
continue
sign_text += f"{key}={params[key]}"
sign_text += PC_SIGN_SECRET
return hashlib.md5(sign_text.encode("utf-8")).hexdigest()
async def _ensure_tieba_origin(self) -> None:
if not self.playwright_page:
raise Exception("playwright_page is required for tieba PC API requests")
if not self.playwright_page.url.startswith(self._host):
await self.playwright_page.goto(self._host, wait_until="domcontentloaded")
async def _fetch_json_by_browser(
self,
uri: str,
method: str = "GET",
params: Optional[Dict[str, Any]] = None,
data: Optional[Dict[str, Any]] = None,
use_sign: bool = False,
) -> Dict:
"""
Fetch current Tieba PC JSON APIs from the browser context.
These APIs rely on logged-in browser cookies and Baidu's PC signing
convention; direct Python requests can be blocked by local proxy or TLS interception.
"""
await self._ensure_tieba_origin()
params = {k: v for k, v in (params or {}).items() if v is not None}
data = {k: v for k, v in (data or {}).items() if v is not None}
if use_sign:
sign_source = data if method.upper() == "POST" else params
sign_source.setdefault("subapp_type", "pc")
sign_source.setdefault("_client_type", "20")
sign_source["sign"] = self._sign_pc_params(sign_source)
url = f"{self._host}{uri}"
if params:
url = f"{url}?{urlencode(params)}"
body = urlencode(data) if data else ""
response = await self.playwright_page.evaluate(
"""async ({ url, method, body }) => {
const headers = { "Accept": "application/json, text/plain, */*" };
const options = { method, credentials: "include", headers };
if (method === "POST") {
headers["Content-Type"] = "application/x-www-form-urlencoded;charset=UTF-8";
options.body = body;
}
const resp = await fetch(url, options);
const text = await resp.text();
return { status: resp.status, text };
}""",
{"url": url, "method": method.upper(), "body": body},
)
if response["status"] != 200:
raise Exception(f"Tieba PC API failed, status={response['status']}, url={url}")
try:
json_data = json.loads(response["text"])
except json.JSONDecodeError as exc:
raise Exception(f"Tieba PC API returned non-JSON, url={url}, body={response['text'][:500]}") from exc
error_code = json_data.get("error_code", json_data.get("no", 0))
if str(error_code) not in {"0", "None"}:
raise Exception(f"Tieba PC API error, url={url}, response={json_data}")
return json_data
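One subtlety worth flagging: the error check above accepts two payload families, because the page_pc-style endpoints report error_code while the mobile search endpoint reports no. Both success shapes appear in the new tests; sketched side by side:

# Success shapes accepted by the error_code/no check:
ok_page_pc = {"error_code": 0, "thread": {}}   # /c/f/pb/page_pc family
ok_multsearch = {"no": 0, "data": {}}          # /mo/q/search/multsearch family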
async def _get_pc_tbs(self) -> str:
if self._pc_tbs:
return self._pc_tbs
sync_data = await self._fetch_json_by_browser(
"/c/s/pc/sync",
params={"subapp_type": "pc", "_client_type": "20"},
use_sign=True,
)
self._pc_tbs = (
sync_data.get("data", {})
.get("anti", {})
.get("tbs", "")
)
if not self._pc_tbs:
raise Exception(f"Can not get Tieba tbs from pc sync API: {sync_data}")
return self._pc_tbs
async def _get_pc_page_data(self, note_id: str, page: int = 1) -> Dict:
tbs = await self._get_pc_tbs()
return await self._fetch_json_by_browser(
"/c/f/pb/page_pc",
method="POST",
data={
"pn": page,
"lz": 0,
"r": 2,
"mark_type": 0,
"back": 0,
"fr": "",
"kz": note_id,
"session_request_times": 1,
"tbs": tbs,
"subapp_type": "pc",
"_client_type": "20",
},
use_sign=True,
)
@staticmethod
def _extract_creator_portrait(creator_url: str) -> str:
creator_url = (creator_url or "").strip()
if not creator_url:
return ""
if not creator_url.startswith(("http://", "https://")):
return creator_url.split("?")[0]
parsed = urlparse(creator_url)
query = parse_qs(parsed.query)
portrait = (
query.get("id", [""])[0]
or query.get("portrait", [""])[0]
or query.get("un", [""])[0]
)
return unquote(portrait).split("?")[0]
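Expected portrait extraction, using the creator URL from the Tested lines above (an assertion-style sketch, not part of the diff):

url = "https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA"
assert BaiduTieBaClient._extract_creator_portrait(url) == "tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA"
# A bare portrait id passes through unchanged.
assert BaiduTieBaClient._extract_creator_portrait("tb.1.example") == "tb.1.example"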
def _sync_request(self, method, url, proxy=None, **kwargs):
"""
@@ -270,35 +395,29 @@ class BaiduTieBaClient(AbstractApiClient):
utils.logger.error("[BaiduTieBaClient.get_notes_by_keyword] playwright_page is None, cannot use browser mode")
raise Exception("playwright_page is required for browser-based search")
# Construct search URL
# Example: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=keyword
search_url = f"{self._host}/f/search/res"
params = {
"ie": "utf-8",
"qw": keyword,
"rn": page_size,
"rn": max(page_size, 20),
"st": sort.value,
"word": keyword,
"needbrand": 1,
"sug_type": 2,
"pn": page,
"sm": sort.value,
"only_thread": note_type.value,
"come_from": "search",
"subapp_type": "pc",
"_client_type": "20",
}
# Concatenate full URL
full_url = f"{search_url}?{urlencode(params)}"
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Accessing search page: {full_url}")
utils.logger.info(
f"[BaiduTieBaClient.get_notes_by_keyword] Accessing search API: "
f"{self._host}/mo/q/search/multsearch?{urlencode(params)}"
)
try:
# Use Playwright to access search page
await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
# Wait for page loading, using delay setting from config file
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
# Get page HTML content
page_content = await self.playwright_page.content()
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Successfully retrieved search page HTML, length: {len(page_content)}")
# Extract search results
notes = self._page_extractor.extract_search_note_list(page_content)
api_data = await self._fetch_json_by_browser(
"/mo/q/search/multsearch",
params=params,
use_sign=True,
)
notes = self._page_extractor.extract_search_note_list_from_api(api_data)[:page_size]
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Extracted {len(notes)} posts")
return notes
@@ -319,23 +438,11 @@ class BaiduTieBaClient(AbstractApiClient):
utils.logger.error("[BaiduTieBaClient.get_note_by_id] playwright_page is None, cannot use browser mode")
raise Exception("playwright_page is required for browser-based note detail fetching")
# Construct post detail URL
note_url = f"{self._host}/p/{note_id}"
utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Accessing post detail page: {note_url}")
utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Accessing post detail API, note_id: {note_id}")
try:
# Use Playwright to access post detail page
await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
# Wait for page loading, using delay setting from config file
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
# Get page HTML content
page_content = await self.playwright_page.content()
utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Successfully retrieved post detail HTML, length: {len(page_content)}")
# Extract post details
note_detail = self._page_extractor.extract_note_detail(page_content)
api_data = await self._get_pc_page_data(note_id=note_id, page=1)
note_detail = self._page_extractor.extract_note_detail_from_api(api_data)
return note_detail
except Exception as e:
@@ -367,23 +474,15 @@ class BaiduTieBaClient(AbstractApiClient):
current_page = 1
while note_detail.total_replay_page >= current_page and len(result) < max_count:
# Construct comment page URL
comment_url = f"{self._host}/p/{note_detail.note_id}?pn={current_page}"
utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Accessing comment page: {comment_url}")
utils.logger.info(
f"[BaiduTieBaClient.get_note_all_comments] Accessing comment API, "
f"note_id: {note_detail.note_id}, page: {current_page}"
)
try:
# Use Playwright to access comment page
await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
# Wait for page loading, using delay setting from config file
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
# Get page HTML content
page_content = await self.playwright_page.content()
# Extract comments
comments = self._page_extractor.extract_tieba_note_parment_comments(
page_content, note_id=note_detail.note_id
api_data = await self._get_pc_page_data(note_id=note_detail.note_id, page=current_page)
comments = self._page_extractor.extract_tieba_note_parent_comments_from_api(
api_data, note_detail=note_detail
)
if not comments:
@@ -498,7 +597,7 @@ class BaiduTieBaClient(AbstractApiClient):
async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
"""
Get post list by Tieba name (uses Playwright to access page, avoiding API detection)
Get post list by Tieba name from current PC forum JSON API.
Args:
tieba_name: Tieba name
page_num: Page number
@@ -510,23 +609,33 @@ class BaiduTieBaClient(AbstractApiClient):
utils.logger.error("[BaiduTieBaClient.get_notes_by_tieba_name] playwright_page is None, cannot use browser mode")
raise Exception("playwright_page is required for browser-based tieba note fetching")
# Construct Tieba post list URL
tieba_url = f"{self._host}/f?kw={quote(tieba_name)}&pn={page_num}"
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Accessing Tieba page: {tieba_url}")
page_size = 30
api_page = page_num // page_size + 1
tbs = await self._get_pc_tbs()
utils.logger.info(
f"[BaiduTieBaClient.get_notes_by_tieba_name] Accessing Tieba FRS API, "
f"tieba_name: {tieba_name}, page: {api_page}"
)
try:
# Use Playwright to access Tieba page
await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
# Wait for page loading, using delay setting from config file
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
# Get page HTML content
page_content = await self.playwright_page.content()
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Successfully retrieved Tieba page HTML, length: {len(page_content)}")
# Extract post list
notes = self._page_extractor.extract_tieba_note_list(page_content)
api_data = await self._fetch_json_by_browser(
"/c/f/frs/page_pc",
method="POST",
data={
"kw": quote(tieba_name),
"pn": api_page,
"sort_type": -1,
"is_newfrs": 1,
"is_newfeed": 1,
"rn": page_size,
"rn_need": 10,
"tbs": tbs,
"subapp_type": "pc",
"_client_type": "20",
},
use_sign=True,
)
notes = self._page_extractor.extract_tieba_note_list_from_frs_api(api_data)[:page_size]
utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Extracted {len(notes)} posts")
return notes
@@ -534,38 +643,72 @@ class BaiduTieBaClient(AbstractApiClient):
utils.logger.error(f"[BaiduTieBaClient.get_notes_by_tieba_name] Failed to get Tieba post list: {e}")
raise
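The pn-to-page conversion above bridges two conventions: core.py still advances page_num in offsets of 30 (the old HTML pager), while the FRS API takes a 1-based page number. A quick check, assuming page_size stays at 30:

page_size = 30
# Offset-style page_num values map onto 1-based FRS pages.
assert [pn // page_size + 1 for pn in (0, 30, 60)] == [1, 2, 3]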
async def get_creator_info_by_url(self, creator_url: str) -> str:
async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
"""
Get creator information by creator URL (uses Playwright to access page, avoiding API detection)
Get creator information by creator URL from current PC JSON API.
Args:
creator_url: Creator homepage URL
Returns:
str: Page HTML content
TiebaCreator: Creator information
"""
if not self.playwright_page:
utils.logger.error("[BaiduTieBaClient.get_creator_info_by_url] playwright_page is None, cannot use browser mode")
raise Exception("playwright_page is required for browser-based creator info fetching")
utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Accessing creator homepage: {creator_url}")
portrait = self._extract_creator_portrait(creator_url)
if not portrait:
raise Exception(f"Can not extract Tieba creator portrait from url: {creator_url}")
utils.logger.info(
f"[BaiduTieBaClient.get_creator_info_by_url] Accessing creator info API, portrait: {portrait}"
)
try:
# Use Playwright to access creator homepage
await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
# Wait for page loading, using delay setting from config file
await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
# Get page HTML content
page_content = await self.playwright_page.content()
utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Successfully retrieved creator homepage HTML, length: {len(page_content)}")
return page_content
api_data = await self._fetch_json_by_browser(
"/c/u/pc/homeSidebarRight",
params={
"portrait": portrait,
"un": "",
"subapp_type": "pc",
"_client_type": "20",
},
use_sign=True,
)
return self._page_extractor.extract_creator_info_from_api(api_data)
except Exception as e:
utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] Failed to get creator homepage: {e}")
utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] Failed to get creator info: {e}")
raise
async def get_notes_by_creator_portrait(
self, portrait: str, page_number: int, page_size: int = 20
) -> Dict:
"""
Get creator's thread feed by creator portrait from current PC JSON API.
"""
if not self.playwright_page:
utils.logger.error("[BaiduTieBaClient.get_notes_by_creator_portrait] playwright_page is None, cannot use browser mode")
raise Exception("playwright_page is required for browser-based creator notes fetching")
utils.logger.info(
f"[BaiduTieBaClient.get_notes_by_creator_portrait] Accessing creator feed API, "
f"portrait: {portrait}, page: {page_number}"
)
return await self._fetch_json_by_browser(
"/c/u/feed/myThread",
params={
"pn": page_number,
"rn": page_size,
"portrait": portrait,
"type": 1,
"un": "",
"subapp_type": "pc",
"_client_type": "20",
},
use_sign=True,
)
async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
"""
Get creator's posts by creator (uses Playwright to access page, avoiding API detection)
@@ -648,12 +791,12 @@ class BaiduTieBaClient(AbstractApiClient):
while notes_has_more == 1 and (max_note_count == 0 or total_get_count < max_note_count):
notes_res = await self.get_notes_by_creator(user_name, page_number)
if not notes_res or notes_res.get("no") != 0:
utils.logger.error(f"[WeiboClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}")
utils.logger.error(f"[TieBaClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}")
break
notes_data = notes_res.get("data")
notes_has_more = notes_data.get("has_more")
notes = notes_data["thread_list"]
utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}")
utils.logger.info(f"[TieBaClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}")
note_detail_task = [self.get_note_by_id(note['thread_id']) for note in notes]
notes = await asyncio.gather(*note_detail_task)
@@ -664,3 +807,59 @@ class BaiduTieBaClient(AbstractApiClient):
page_number += 1
total_get_count += page_per_count
return result
async def get_all_notes_by_creator_url(
self,
creator_url: str,
crawl_interval: float = 1.0,
callback: Optional[Callable] = None,
max_note_count: int = 0,
) -> List[TiebaNote]:
"""
Get all creator posts by current PC creator feed API.
"""
portrait = self._extract_creator_portrait(creator_url)
if not portrait:
raise Exception(f"Can not extract Tieba creator portrait from url: {creator_url}")
result: List[TiebaNote] = []
page_number = 1
page_size = 20
while max_note_count == 0 or len(result) < max_note_count:
notes_res = await self.get_notes_by_creator_portrait(
portrait=portrait,
page_number=page_number,
page_size=page_size,
)
thread_id_list = self._page_extractor.extract_creator_thread_id_list_from_api(notes_res)
if not thread_id_list:
utils.logger.info(
f"[BaiduTieBaClient.get_all_notes_by_creator_url] "
f"Creator portrait:{portrait} page:{page_number} has no threads"
)
break
if max_note_count:
thread_id_list = thread_id_list[: max_note_count - len(result)]
utils.logger.info(
f"[BaiduTieBaClient.get_all_notes_by_creator_url] "
f"got portrait:{portrait} thread ids len: {len(thread_id_list)}"
)
note_detail_task = [self.get_note_by_id(thread_id) for thread_id in thread_id_list]
notes = await asyncio.gather(*note_detail_task)
notes = [note for note in notes if note]
if callback and notes:
await callback(notes)
result.extend(notes)
data = notes_res.get("data", {})
has_more = int(data.get("has_more") or 0)
if not has_more:
break
await asyncio.sleep(crawl_interval)
page_number += 1
return result
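End to end, the creator path is now portrait extraction, then homeSidebarRight for the profile, then myThread for thread ids, then page_pc for each detail. A condensed usage sketch, assuming an initialized, logged-in BaiduTieBaClient named client inside an async context:

creator = await client.get_creator_info_by_url(
    "https://tieba.baidu.com/home/main?id=tb.1.example"  # hypothetical portrait
)
notes = await client.get_all_notes_by_creator_url(
    "https://tieba.baidu.com/home/main?id=tb.1.example",
    crawl_interval=1.0,
    max_note_count=20,  # 0 means no limit
)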

View File

@@ -213,7 +213,7 @@ class TieBaCrawler(AbstractCrawler):
Returns:
"""
tieba_limit_count = 50
tieba_limit_count = 30
if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
for tieba_name in config.TIEBA_NAME_LIST:
@@ -245,7 +245,7 @@ class TieBaCrawler(AbstractCrawler):
page_number += tieba_limit_count
async def get_specified_notes(
self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST
self, note_id_list: Optional[List[str]] = None
):
"""
Get the information and comments of the specified post
@@ -255,6 +255,8 @@ class TieBaCrawler(AbstractCrawler):
Returns:
"""
if note_id_list is None:
note_id_list = config.TIEBA_SPECIFIED_ID_LIST
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
task_list = [
self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore)
@@ -365,18 +367,15 @@ class TieBaCrawler(AbstractCrawler):
"""
utils.logger.info(
"[WeiboCrawler.get_creators_and_notes] Begin get weibo creators"
"[TieBaCrawler.get_creators_and_notes] Begin get tieba creators"
)
for creator_url in config.TIEBA_CREATOR_URL_LIST:
creator_page_html_content = await self.tieba_client.get_creator_info_by_url(
creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(
creator_url=creator_url
)
creator_info: TiebaCreator = self._page_extractor.extract_creator_info(
creator_page_html_content
)
if creator_info:
utils.logger.info(
f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}"
f"[TieBaCrawler.get_creators_and_notes] creator info: {creator_info}"
)
if not creator_info:
raise Exception("Get creator info error")
@@ -385,12 +384,11 @@ class TieBaCrawler(AbstractCrawler):
# Get all note information of the creator
all_notes_list = (
await self.tieba_client.get_all_notes_by_creator_user_name(
user_name=creator_info.user_name,
await self.tieba_client.get_all_notes_by_creator_url(
creator_url=creator_url,
crawl_interval=0,
callback=tieba_store.batch_update_tieba_notes,
max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
creator_page_html_content=creator_page_html_content,
)
)
@@ -398,7 +396,7 @@ class TieBaCrawler(AbstractCrawler):
else:
utils.logger.error(
f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
f"[TieBaCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
)
async def _navigate_to_tieba_via_baidu(self):

View File

@@ -22,8 +22,8 @@
import html
import json
import re
from typing import Dict, List, Tuple
from urllib.parse import parse_qs, unquote
from typing import Any, Dict, List, Tuple
from urllib.parse import parse_qs, quote, unquote, urljoin
from parsel import Selector
@@ -39,6 +39,306 @@ class TieBaExtractor:
def __init__(self):
pass
@staticmethod
def _class_contains(class_name: str) -> str:
return f"contains(concat(' ', normalize-space(@class), ' '), ' {class_name} ')"
@staticmethod
def _normalize_text(text: str) -> str:
return re.sub(r"\s+", " ", text or "").strip()
@classmethod
def _selector_text(cls, selector: Selector, xpath: str) -> str:
node = selector.xpath(xpath)
if not node:
return ""
return cls._normalize_text(node[0].xpath("string(.)").get(default=""))
@staticmethod
def _absolute_url(url: str) -> str:
return urljoin(const.TIEBA_URL, (url or "").strip())
@staticmethod
def _extract_note_id_from_url(url: str) -> str:
note_id_match = re.search(r"/p/(\d+)", url or "")
return note_id_match.group(1) if note_id_match else ""
@staticmethod
def _text_to_int(text: str) -> int:
match = re.search(r"\d+", text or "")
return int(match.group(0)) if match else 0
@staticmethod
def _ensure_tieba_suffix(tieba_name: str) -> str:
tieba_name = (tieba_name or "").strip()
return tieba_name if not tieba_name or tieba_name.endswith("吧") else f"{tieba_name}吧"
@classmethod
def _tieba_link_from_name(cls, tieba_name: str) -> str:
if not tieba_name:
return const.TIEBA_URL
return f"{const.TIEBA_URL}/f?kw={quote(tieba_name.removesuffix(''))}"
@classmethod
def _extract_api_content_text(cls, content: Any) -> str:
if isinstance(content, str):
return cls._normalize_text(content)
if not isinstance(content, list):
return ""
text_list: List[str] = []
for item in content:
if not isinstance(item, dict):
continue
text = item.get("text") or item.get("c") or ""
if text:
text_list.append(str(text))
return cls._normalize_text("".join(text_list))
@staticmethod
def _api_user_map(api_data: Dict) -> Dict[str, Dict]:
return {str(user.get("id")): user for user in api_data.get("user_list", []) if user.get("id")}
@staticmethod
def _api_user_link(user: Dict) -> str:
portrait = (user or {}).get("portrait", "")
if not portrait:
return ""
return f"{const.TIEBA_URL}/home/main?id={quote(str(portrait))}"
@staticmethod
def _api_user_avatar(user: Dict) -> str:
image_data = (
(user or {})
.get("user_show_info", {})
.get("feed_head", {})
.get("image_data", {})
)
return image_data.get("img_url") or (
"https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/"
f"{user.get('portrait', '')}"
if user and user.get("portrait")
else ""
)
def extract_search_note_list_from_api(self, api_data: Dict) -> List[TiebaNote]:
"""
Extract Tieba post list from current PC search JSON API.
"""
result: List[TiebaNote] = []
cards = api_data.get("data", {}).get("card_list", [])
for card in cards:
if card.get("cardInfo") != "thread" and card.get("cardStyle") != "thread":
continue
item = card.get("data") or {}
note_id = str(item.get("tid") or "")
if not note_id:
continue
user = item.get("user") or {}
tieba_name = self._ensure_tieba_suffix(item.get("forum_name") or "")
tieba_note = TiebaNote(
note_id=note_id,
title=self._normalize_text(item.get("title") or ""),
desc=self._normalize_text(item.get("content") or ""),
note_url=f"{const.TIEBA_URL}/p/{note_id}",
publish_time=utils.get_time_str_from_unix_time(
item.get("time") or item.get("create_time") or 0
),
user_link="",
user_nickname=user.get("show_nickname") or user.get("user_name") or "",
user_avatar=user.get("portrait") or user.get("portraith") or "",
tieba_name=tieba_name,
tieba_link=self._tieba_link_from_name(tieba_name),
total_replay_num=item.get("post_num") or 0,
)
result.append(tieba_note)
return result
def extract_note_detail_from_api(self, api_data: Dict) -> TiebaNote:
"""
Extract Tieba post detail from current PC page_pc JSON API.
"""
thread = api_data.get("thread") or {}
first_floor = api_data.get("first_floor") or {}
forum = api_data.get("forum") or api_data.get("display_forum") or {}
page = api_data.get("page") or {}
user_map = self._api_user_map(api_data)
author = user_map.get(str(first_floor.get("author_id"))) or {}
note_id = str(thread.get("id") or thread.get("tid") or first_floor.get("tid") or "")
tieba_name = self._ensure_tieba_suffix(forum.get("name") or "")
note = TiebaNote(
note_id=note_id,
title=self._clean_title(thread.get("title") or first_floor.get("title") or "", tieba_name),
desc=self._extract_api_content_text(
first_floor.get("content")
or thread.get("origin_thread_info", {}).get("abstract")
or thread.get("origin_thread_info", {}).get("content")
),
note_url=f"{const.TIEBA_URL}/p/{note_id}",
publish_time=utils.get_time_str_from_unix_time(
first_floor.get("time") or thread.get("create_time") or 0
),
user_link=self._api_user_link(author),
user_nickname=author.get("name_show") or author.get("name") or "",
user_avatar=self._api_user_avatar(author),
tieba_name=tieba_name,
tieba_link=self._tieba_link_from_name(tieba_name),
total_replay_num=thread.get("reply_num") or 0,
total_replay_page=page.get("total_page") or 0,
ip_location=author.get("ip_address") or "",
)
return note
def extract_tieba_note_parent_comments_from_api(
self, api_data: Dict, note_detail: TiebaNote
) -> List[TiebaComment]:
"""
Extract first-level comments from current PC page_pc JSON API.
"""
forum = api_data.get("forum") or api_data.get("display_forum") or {}
tieba_id = str(forum.get("id") or "")
tieba_name = note_detail.tieba_name or self._ensure_tieba_suffix(forum.get("name") or "")
tieba_link = note_detail.tieba_link or self._tieba_link_from_name(tieba_name)
user_map = self._api_user_map(api_data)
result: List[TiebaComment] = []
for item in api_data.get("post_list", []):
comment_id = str(item.get("id") or "")
if not comment_id:
continue
user = user_map.get(str(item.get("author_id"))) or {}
comment = TiebaComment(
comment_id=comment_id,
sub_comment_count=item.get("sub_post_number") or 0,
content=self._extract_api_content_text(item.get("content")),
note_url=note_detail.note_url,
user_link=self._api_user_link(user),
user_nickname=user.get("name_show") or user.get("name") or "",
user_avatar=self._api_user_avatar(user),
tieba_id=tieba_id,
tieba_name=tieba_name,
tieba_link=tieba_link,
ip_location=user.get("ip_address") or "",
publish_time=utils.get_time_str_from_unix_time(item.get("time") or 0),
note_id=note_detail.note_id,
)
result.append(comment)
return result
def extract_creator_info_from_api(self, api_data: Dict) -> TiebaCreator:
"""
Extract Tieba creator information from current PC creator JSON API.
"""
user = api_data.get("data", {}).get("user", {})
if not user:
raise ValueError(f"Creator API response does not contain user info: {api_data}")
gender_value = user.get("sex", user.get("gender", 0))
gender = "Unknown"
if gender_value == 1:
gender = "Male"
elif gender_value == 2:
gender = "Female"
return TiebaCreator(
user_id=str(user.get("id", "")),
user_name=str(user.get("name", "")),
nickname=str(user.get("name_show") or user.get("name") or ""),
avatar=self._api_user_avatar(user),
gender=gender,
ip_location=str(user.get("ip_address", "")),
follows=int(user.get("concern_num") or 0),
fans=int(user.get("fans_num") or 0),
registration_duration=str(user.get("tb_age", "")),
)
@staticmethod
def extract_creator_thread_id_list_from_api(api_data: Dict) -> List[str]:
"""
Extract creator thread ids from current PC creator feed JSON API.
"""
thread_ids: List[str] = []
for item in api_data.get("data", {}).get("list", []):
thread_info = item.get("thread_info") or {}
thread_id = thread_info.get("tid") or thread_info.get("id")
if thread_id:
thread_ids.append(str(thread_id))
return thread_ids
def extract_tieba_note_list_from_frs_api(self, api_data: Dict) -> List[TiebaNote]:
"""
Extract Tieba thread ids from current PC forum page JSON API.
The by-forum command immediately fetches full details for every id, so
this list intentionally carries only stable routing fields.
"""
forum = api_data.get("forum", {})
tieba_name = self._ensure_tieba_suffix(forum.get("name") or "")
tieba_link = self._tieba_link_from_name(tieba_name)
tids = [
tid.strip()
for tid in str(forum.get("tids") or "").split(",")
if tid.strip()
]
return [
TiebaNote(
note_id=tid,
title="",
desc="",
note_url=f"{const.TIEBA_URL}/p/{tid}",
tieba_name=tieba_name,
tieba_link=tieba_link,
)
for tid in tids
]
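The FRS payload carries thread ids as one comma-joined string, usually with a trailing comma, hence the filter above. The value here is copied from the new fixture in the tests:

tids_raw = "10376710029,10636556989,"
assert [tid.strip() for tid in tids_raw.split(",") if tid.strip()] == [
    "10376710029",
    "10636556989",
]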
@staticmethod
def _decode_js_string(value: str) -> str:
if not value or value == "null":
return ""
try:
decoded_value = json.loads(f'"{value}"')
return decoded_value if isinstance(decoded_value, str) else str(decoded_value)
except Exception:
return value
@classmethod
def _extract_forum_info(cls, selector: Selector, page_content: str) -> Tuple[str, str]:
forum_xpath = f"//a[{cls._class_contains('card_title_fname')}]"
forum_link_selector = selector.xpath(forum_xpath)
tieba_name = cls._selector_text(selector, forum_xpath)
tieba_link = cls._absolute_url(forum_link_selector.xpath("./@href").get(default=""))
if not tieba_name:
patterns = [
r"PageData\.forum\s*=\s*\{.*?['\"]name['\"]\s*:\s*\"([^\"\\\\]*(?:\\\\.[^\"\\\\]*)*)\"",
r'"forum_name"\s*:\s*"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"',
r'"kw"\s*:\s*"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"',
]
for pattern in patterns:
match = re.search(pattern, page_content, re.S)
if match:
tieba_name = cls._decode_js_string(match.group(1))
if tieba_name:
break
if not tieba_name:
title = selector.xpath("//title/text()").get(default="")
match = re.search(r"(.+?)吧[-_]", title)
if match:
tieba_name = cls._normalize_text(match.group(1))
if not tieba_link and tieba_name:
tieba_link = f"{const.TIEBA_URL}/f?kw={quote(tieba_name.removesuffix(''))}"
return tieba_name, tieba_link or const.TIEBA_URL
@classmethod
def _clean_title(cls, title: str, tieba_name: str = "") -> str:
title = cls._normalize_text(title)
title = re.sub(r"_(?:百度贴吧|Baidu Tieba)$", "", title).strip()
for name in {tieba_name, tieba_name.removesuffix("吧")}:
if name:
title = title.replace(f"{name}", "").strip()
return title
@staticmethod
def extract_search_note_list(page_content: str) -> List[TiebaNote]:
"""
@@ -49,23 +349,115 @@ class TieBaExtractor:
Returns:
List of Tieba post objects
"""
xpath_selector = "//div[@class='s_post']"
post_list = Selector(text=page_content).xpath(xpath_selector)
extractor = TieBaExtractor()
selector = Selector(text=page_content)
post_list = selector.xpath(
f"//div[{extractor._class_contains('s_post')}]"
)
result: List[TiebaNote] = []
for post in post_list:
tieba_note = TiebaNote(note_id=post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(),
title=post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(),
desc=post.xpath(".//div[@class='p_content']/text()").get(default='').strip(),
note_url=const.TIEBA_URL + post.xpath(".//span[@class='p_title']/a/@href").get(
default=''),
user_nickname=post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(
default='').strip(), user_link=const.TIEBA_URL + post.xpath(
".//a[starts-with(@href, '/home/main')]/@href").get(default=''),
tieba_name=post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(),
tieba_link=const.TIEBA_URL + post.xpath(".//a[@class='p_forum']/@href").get(
default=''),
publish_time=post.xpath(".//font[@class='p_green p_date']/text()").get(
default='').strip(), )
title_link = post.xpath(".//*[contains(@class, 'p_title')]//a[1]")
note_url = extractor._absolute_url(title_link.xpath("./@href").get(default=""))
note_id = title_link.xpath("./@data-tid").get(default="").strip()
if not note_id:
note_id = extractor._extract_note_id_from_url(note_url)
user_selector = post.xpath(".//a[contains(@href, '/home/main')][1]")
forum_selector = post.xpath(f".//a[{extractor._class_contains('p_forum')}][1]")
tieba_note = TiebaNote(
note_id=note_id,
title=extractor._selector_text(post, ".//*[contains(@class, 'p_title')]//a[1]"),
desc=extractor._selector_text(
post, f".//div[{extractor._class_contains('p_content')}]"
),
note_url=note_url,
user_nickname=extractor._selector_text(
post, ".//a[contains(@href, '/home/main')][1]"
),
user_link=extractor._absolute_url(user_selector.xpath("./@href").get(default="")),
tieba_name=extractor._selector_text(
post, f".//a[{extractor._class_contains('p_forum')}][1]"
),
tieba_link=extractor._absolute_url(forum_selector.xpath("./@href").get(default="")),
publish_time=extractor._selector_text(
post, ".//*[contains(@class, 'p_date')][1]"
),
)
result.append(tieba_note)
if result:
return result
# Tieba search changed to a PC feed/card layout in 2026. The old
# s_post nodes disappeared, while each search result now lives in a
# threadcardclass card with overlay links to /p/<thread_id>.
post_list = selector.xpath(
f"//*[contains(concat(' ', normalize-space(@class), ' '), ' threadcardclass ') "
f"and .//a[contains(@href, '/p/')]]"
)
seen_note_ids = set()
for post in post_list:
title_link = post.xpath(
f".//a[{extractor._class_contains('action-link-bg')} and contains(@href, '/p/')][1]"
f"|.//a[contains(@href, '/p/')][1]"
)
note_url = extractor._absolute_url(title_link.xpath("./@href").get(default=""))
note_id = extractor._extract_note_id_from_url(note_url)
if not note_id or note_id in seen_note_ids:
continue
seen_note_ids.add(note_id)
tieba_name = extractor._selector_text(
post, f".//*[{extractor._class_contains('forum-name-text')}][1]"
)
tieba_link = ""
forum_link = post.xpath(".//a[contains(@href, '/f?')][1]/@href").get(default="")
if forum_link:
tieba_link = extractor._absolute_url(forum_link)
elif tieba_name:
tieba_keyword = tieba_name.removesuffix("吧")
tieba_link = f"{const.TIEBA_URL}/f?kw={quote(tieba_keyword)}"
else:
tieba_link = const.TIEBA_URL
publish_time = ""
top_title_text = extractor._selector_text(
post, f".//*[{extractor._class_contains('top-title')}][1]"
)
publish_match = re.search(r"发布于\s*([^\s]+)", top_title_text)
if publish_match:
publish_time = publish_match.group(1)
title = extractor._selector_text(
post, f".//*[{extractor._class_contains('title-wrap')}][1]"
)
desc = extractor._selector_text(
post, f".//*[{extractor._class_contains('abstract-wrap')}][1]"
)
if not title:
title = extractor._normalize_text(desc[:80])
user_nickname = extractor._selector_text(
post, f".//*[{extractor._class_contains('forum-attention')}][1]"
)
if not user_nickname and publish_time:
user_nickname = extractor._normalize_text(
top_title_text.split("发布于", 1)[0]
)
comment_text = extractor._selector_text(
post, f".//a[{extractor._class_contains('comment-link-zone')}][1]"
)
tieba_note = TiebaNote(
note_id=note_id,
title=title,
desc=desc,
note_url=f"{const.TIEBA_URL}/p/{note_id}",
user_nickname=user_nickname,
user_link="",
tieba_name=tieba_name,
tieba_link=tieba_link,
publish_time=publish_time,
total_replay_num=extractor._text_to_int(comment_text),
)
result.append(tieba_note)
return result
@@ -80,27 +472,39 @@ class TieBaExtractor:
"""
page_content = page_content.replace('<!--', "")
content_selector = Selector(text=page_content)
xpath_selector = "//ul[@id='thread_list']/li"
xpath_selector = f"//ul[@id='thread_list']/li[{self._class_contains('j_thread_list')}]"
post_list = content_selector.xpath(xpath_selector)
tieba_name, tieba_link = self._extract_forum_info(content_selector, page_content)
result: List[TiebaNote] = []
for post_selector in post_list:
post_field_value: Dict = self.extract_data_field_value(post_selector)
if not post_field_value:
continue
note_id = str(post_field_value.get("id"))
tieba_note = TiebaNote(note_id=note_id,
title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
desc=post_selector.xpath(
".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
default='').strip(), note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=const.TIEBA_URL + post_selector.xpath(
".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
user_nickname=post_field_value.get("authoer_nickname") or post_field_value.get(
"author_name"),
tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(
default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath(
"//a[@class='card_title_fname']/@href").get(default=''),
total_replay_num=post_field_value.get("reply_num", 0))
user_selector = post_selector.xpath(f".//a[{self._class_contains('frs-author-name')}][1]")
title = self._selector_text(post_selector, f".//a[{self._class_contains('j_th_tit')}][1]")
if not title:
title = self._selector_text(post_selector, f".//*[{self._class_contains('threadlist_title')}]//a[1]")
user_nickname = (
post_field_value.get("author_nickname")
or post_field_value.get("author_name")
or self._selector_text(
post_selector, f".//a[{self._class_contains('frs-author-name')}][1]"
)
)
tieba_note = TiebaNote(
note_id=note_id,
title=title,
desc=self._selector_text(
post_selector, f".//div[{self._class_contains('threadlist_abs')}]"
),
note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=self._absolute_url(user_selector.xpath("./@href").get(default="")),
user_nickname=user_nickname,
tieba_name=tieba_name,
tieba_link=tieba_link,
total_replay_num=post_field_value.get("reply_num", 0),
)
result.append(tieba_note)
return result
@@ -114,31 +518,59 @@ class TieBaExtractor:
Tieba post detail object
"""
content_selector = Selector(text=page_content)
first_floor_selector = content_selector.xpath("//div[@class='p_postlist'][1]")
first_floor_selector = content_selector.xpath(
f"//div[{self._class_contains('l_post')} and {self._class_contains('j_l_post')}][1]"
)
only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip()
note_id = only_view_author_link.split("?")[0].split("/")[-1]
if not note_id:
note_id_match = re.search(r'"thread_id"\s*:\s*"?(\d+)"?', page_content)
note_id = note_id_match.group(1) if note_id_match else ""
# Post reply count and reply page count
thread_num_infos = content_selector.xpath(
"//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']")
f"//div[@id='thread_theme_5']//li[{self._class_contains('l_reply_num')}]"
f"//span[{self._class_contains('red')}]"
)
# IP location and publish time
other_info_content = content_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
other_info_content = first_floor_selector.xpath(
f".//div[{self._class_contains('post-tail-wrap')}]"
).get(default="").strip()
ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
note = TiebaNote(note_id=note_id, title=content_selector.xpath("//title/text()").get(default='').strip(),
desc=content_selector.xpath("//meta[@name='description']/@content").get(default='').strip(),
note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=const.TIEBA_URL + first_floor_selector.xpath(
".//a[@class='p_author_face ']/@href").get(default='').strip(),
user_nickname=first_floor_selector.xpath(
".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(),
user_avatar=first_floor_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(
default='').strip(),
tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(
default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath(
"//a[@class='card_title_fname']/@href").get(default=''), ip_location=ip_location,
publish_time=publish_time,
total_replay_num=thread_num_infos[0].xpath("./text()").get(default='').strip(),
total_replay_page=thread_num_infos[1].xpath("./text()").get(default='').strip(), )
note.title = note.title.replace(f"{note.tieba_name}】_Baidu Tieba", "")
tieba_name, tieba_link = self._extract_forum_info(content_selector, page_content)
first_floor_value = self.extract_data_field_value(first_floor_selector)
author_value = first_floor_value.get("author", {}) if first_floor_value else {}
author_link = first_floor_selector.xpath(
f".//a[{self._class_contains('p_author_face')} "
f"or {self._class_contains('p_author_name')}]/@href"
).get(default="")
note = TiebaNote(
note_id=note_id,
title=content_selector.xpath("//title/text()").get(default="").strip(),
desc=content_selector.xpath("//meta[@name='description']/@content").get(default="").strip(),
note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=self._absolute_url(author_link),
user_nickname=(
self._selector_text(first_floor_selector, f".//a[{self._class_contains('p_author_name')}][1]")
or author_value.get("user_nickname")
or author_value.get("user_name", "")
),
user_avatar=first_floor_selector.xpath(
f".//a[{self._class_contains('p_author_face')}]//img/@src"
).get(default="").strip(),
tieba_name=tieba_name,
tieba_link=tieba_link,
ip_location=ip_location,
publish_time=publish_time,
total_replay_num=(
thread_num_infos[0].xpath("./text()").get(default="0").strip()
if len(thread_num_infos) > 0 else 0
),
total_replay_page=(
thread_num_infos[1].xpath("./text()").get(default="0").strip()
if len(thread_num_infos) > 1 else 0
),
)
note.title = self._clean_title(note.title, note.tieba_name)
return note
def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]:
@@ -151,30 +583,56 @@ class TieBaExtractor:
Returns:
List of first-level comment objects
"""
xpath_selector = "//div[@class='l_post l_post_bright j_l_post clearfix ']"
xpath_selector = f"//div[{self._class_contains('l_post')} and {self._class_contains('j_l_post')}]"
comment_list = Selector(text=page_content).xpath(xpath_selector)
content_selector = Selector(text=page_content)
tieba_name, tieba_link = self._extract_forum_info(content_selector, page_content)
result: List[TiebaComment] = []
for comment_selector in comment_list:
comment_field_value: Dict = self.extract_data_field_value(comment_selector)
if not comment_field_value:
comment_content_value = comment_field_value.get("content", {}) if comment_field_value else {}
if not comment_content_value:
continue
tieba_name = comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
other_info_content = comment_selector.xpath(
f".//div[{self._class_contains('post-tail-wrap')}]"
).get(default="").strip()
ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
tieba_comment = TiebaComment(comment_id=str(comment_field_value.get("content").get("post_id")),
sub_comment_count=comment_field_value.get("content").get("comment_num"),
content=utils.extract_text_from_html(
comment_field_value.get("content").get("content")),
note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=const.TIEBA_URL + comment_selector.xpath(
".//a[@class='p_author_face ']/@href").get(default='').strip(),
user_nickname=comment_selector.xpath(
".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(),
user_avatar=comment_selector.xpath(
".//a[@class='p_author_face ']/img/@src").get(default='').strip(),
tieba_id=str(comment_field_value.get("content").get("forum_id", "")),
tieba_name=tieba_name, tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}",
ip_location=ip_location, publish_time=publish_time, note_id=note_id, )
user_selector = comment_selector.xpath(f".//a[{self._class_contains('p_author_name')}][1]")
user_avatar = comment_selector.xpath(
f".//a[{self._class_contains('p_author_face')}]//img/@src"
).get(default="").strip()
if not user_avatar and comment_field_value.get("author", {}).get("portrait"):
portrait = comment_field_value["author"]["portrait"]
user_avatar = (
"https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/"
f"{portrait}"
)
content_html = comment_content_value.get("content") or comment_selector.xpath(
f".//div[{self._class_contains('d_post_content')}]"
).get(default="")
user_nickname = (
self._selector_text(comment_selector, f".//a[{self._class_contains('p_author_name')}][1]")
or comment_field_value.get("author", {}).get("user_nickname")
or comment_field_value.get("author", {}).get("user_name", "")
)
tieba_comment = TiebaComment(
comment_id=str(
comment_content_value.get("post_id")
or comment_selector.xpath("./@data-pid").get(default="")
),
sub_comment_count=comment_content_value.get("comment_num") or 0,
content=utils.extract_text_from_html(content_html),
note_url=const.TIEBA_URL + f"/p/{note_id}",
user_link=self._absolute_url(user_selector.xpath("./@href").get(default="")),
user_nickname=user_nickname,
user_avatar=user_avatar,
tieba_id=str(comment_content_value.get("forum_id", "")),
tieba_name=tieba_name,
tieba_link=tieba_link,
ip_location=ip_location,
publish_time=publish_time,
note_id=note_id,
)
result.append(tieba_comment)
return result
@@ -190,21 +648,24 @@ class TieBaExtractor:
"""
selector = Selector(page_content)
comments = []
comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))
comment_ele_list = selector.xpath(
f"//li[{self._class_contains('lzl_single_post')} and {self._class_contains('j_lzl_s_p')}]"
)
for comment_ele in comment_ele_list:
comment_value = self.extract_data_field_value(comment_ele)
if not comment_value:
continue
comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0]
comment_user_a_selector = comment_ele.xpath(
f"./a[{self._class_contains('j_user_card')} and {self._class_contains('lzl_p_p')}][1]"
)
content = utils.extract_text_from_html(
comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
comment_ele.xpath(f".//span[{self._class_contains('lzl_content_main')}]").get(default=""))
comment = TiebaComment(
comment_id=str(comment_value.get("spid")), content=content,
user_link=comment_user_a_selector.xpath("./@href").get(default=""),
user_nickname=comment_value.get("showname"),
user_link=self._absolute_url(comment_user_a_selector.xpath("./@href").get(default="")),
user_nickname=str(comment_value.get("showname") or ""),
user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""),
publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(),
publish_time=self._selector_text(comment_ele, f".//span[{self._class_contains('lzl_time')}]"),
parent_comment_id=parent_comment.comment_id,
note_id=parent_comment.note_id, note_url=parent_comment.note_url,
tieba_id=parent_comment.tieba_id, tieba_name=parent_comment.tieba_name,

View File

@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
import config
import pytest
from cmd_arg import parse_cmd
from media_platform.tieba import TieBaCrawler
@pytest.mark.asyncio
async def test_tieba_detail_cli_sets_specified_ids():
await parse_cmd(
[
"--platform",
"tieba",
"--type",
"detail",
"--specified_id",
"https://tieba.baidu.com/p/10451142633,9835114923",
]
)
assert config.TIEBA_SPECIFIED_ID_LIST == ["10451142633", "9835114923"]
@pytest.mark.asyncio
async def test_tieba_creator_cli_sets_creator_urls():
await parse_cmd(
[
"--platform",
"tieba",
"--type",
"creator",
"--creator_id",
"tb.1.example,https://tieba.baidu.com/home/main?id=tb.1.raw",
]
)
assert config.TIEBA_CREATOR_URL_LIST == [
"https://tieba.baidu.com/home/main?id=tb.1.example",
"https://tieba.baidu.com/home/main?id=tb.1.raw",
]
@pytest.mark.asyncio
async def test_tieba_detail_reads_runtime_specified_ids(monkeypatch):
crawler = TieBaCrawler()
seen_note_ids = []
async def fake_get_note_detail(note_id, semaphore):
seen_note_ids.append(note_id)
return None
async def fake_batch_get_comments(note_details):
return None
monkeypatch.setattr(config, "TIEBA_SPECIFIED_ID_LIST", ["10451142633"])
monkeypatch.setattr(crawler, "get_note_detail_async_task", fake_get_note_detail)
monkeypatch.setattr(crawler, "batch_get_note_comments", fake_batch_get_comments)
await crawler.get_specified_notes()
assert seen_note_ids == ["10451142633"]

View File

@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-
import pytest
from media_platform.tieba.client import BaiduTieBaClient
from model.m_baidu_tieba import TiebaComment, TiebaNote
class DummyPage:
url = "https://tieba.baidu.com/"
@pytest.mark.asyncio
async def test_search_uses_requested_page_number():
client = BaiduTieBaClient(playwright_page=DummyPage())
calls = []
async def fake_fetch(uri, method="GET", params=None, data=None, use_sign=False):
calls.append((uri, params))
return {"no": 0, "data": {"card_list": []}}
client._fetch_json_by_browser = fake_fetch
await client.get_notes_by_keyword("编程兼职", page=2, page_size=10)
assert calls[0][0] == "/mo/q/search/multsearch"
assert calls[0][1]["pn"] == 2
@pytest.mark.asyncio
async def test_comments_walk_pages_until_total_reply_page():
client = BaiduTieBaClient(playwright_page=DummyPage())
pages = []
note = TiebaNote(
note_id="9835114923",
title="title",
note_url="https://tieba.baidu.com/p/9835114923",
tieba_name="加工中心吧",
tieba_link="https://tieba.baidu.com/f?kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83",
total_replay_page=2,
)
async def fake_get_page_data(note_id, page=1):
pages.append(page)
return {"forum": {"id": 1, "name": "加工中心"}, "post_list": []}
def fake_extract_comments(api_data, note_detail):
page = pages[-1]
return [
TiebaComment(
comment_id=str(page),
content="comment",
note_id=note_detail.note_id,
note_url=note_detail.note_url,
tieba_id="1",
tieba_name=note_detail.tieba_name,
tieba_link=note_detail.tieba_link,
)
]
client._get_pc_page_data = fake_get_page_data
client._page_extractor.extract_tieba_note_parent_comments_from_api = fake_extract_comments
await client.get_note_all_comments(note, crawl_interval=0, max_count=10)
assert pages == [1, 2]
@pytest.mark.asyncio
async def test_creator_feed_walks_until_has_more_false(monkeypatch):
client = BaiduTieBaClient(playwright_page=DummyPage())
pages = []
async def fake_get_notes_by_creator_portrait(portrait, page_number, page_size=20):
pages.append(page_number)
return {
"error_code": 0,
"data": {
"has_more": 1 if page_number == 1 else 0,
"list": [
{
"thread_info": {
"id": str(1000 + page_number),
"tid": str(1000 + page_number),
}
}
],
},
}
async def fake_get_note_by_id(note_id):
return TiebaNote(
note_id=note_id,
title="title",
note_url=f"https://tieba.baidu.com/p/{note_id}",
tieba_name="加工中心吧",
tieba_link="https://tieba.baidu.com/f?kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83",
)
async def fake_sleep(_):
return None
client.get_notes_by_creator_portrait = fake_get_notes_by_creator_portrait
client.get_note_by_id = fake_get_note_by_id
monkeypatch.setattr("media_platform.tieba.client.asyncio.sleep", fake_sleep)
notes = await client.get_all_notes_by_creator_url("tb.1.creator", crawl_interval=0)
assert pages == [1, 2]
assert [note.note_id for note in notes] == ["1001", "1002"]

View File

@@ -0,0 +1,278 @@
# -*- coding: utf-8 -*-
from pathlib import Path
from media_platform.tieba.help import TieBaExtractor
from model.m_baidu_tieba import TiebaComment
FIXTURE_DIR = Path(__file__).parent.parent / "media_platform" / "tieba" / "test_data"
def read_fixture(name: str) -> str:
return (FIXTURE_DIR / name).read_text(encoding="utf-8")
def test_extract_search_note_list_from_keyword_page():
notes = TieBaExtractor.extract_search_note_list(read_fixture("search_keyword_notes.html"))
assert len(notes) == 10
assert notes[0].note_id == "9117888152"
assert notes[0].title.startswith("武汉交互空间科技")
assert notes[0].tieba_name == "武汉交互空间"
assert notes[0].user_nickname == "VR虚拟达人"
def test_extract_search_note_list_from_current_pc_card_page():
page_content = """
<html>
<body>
<div class="threadcardclass thread-new3 index-feed-cards">
<a class="action-link-bg" href="https://tieba.baidu.com/p/10559655942?fr=undefined"></a>
<div class="thread-forum-name display-flex align-center">
<span class="forum-name-text">诸城吧</span>
</div>
<div class="top-title">
<span class="forum-attention user">754023117</span>
<span>发布于 2026-3-15</span>
</div>
<div class="title-wrap"><span>数,英,编程老师</span></div>
<div class="abstract-wrap">
<span>培训班需求,数学,英语,编程老师,专职兼职都可</span>
</div>
<a class="comment-link-zone" href="https://tieba.baidu.com/p/10559655942?showComment=1">
<span class="action-number">19</span>
</a>
</div>
</body>
</html>
"""
notes = TieBaExtractor.extract_search_note_list(page_content)
assert len(notes) == 1
assert notes[0].note_id == "10559655942"
assert notes[0].title == "数,英,编程老师"
assert notes[0].desc == "培训班需求,数学,英语,编程老师,专职兼职都可"
assert notes[0].tieba_name == "诸城吧"
assert notes[0].tieba_link.endswith("kw=%E8%AF%B8%E5%9F%8E")
assert notes[0].user_nickname == "754023117"
assert notes[0].publish_time == "2026-3-15"
assert notes[0].total_replay_num == 19
def test_extract_search_note_list_from_current_pc_api():
api_data = {
"no": 0,
"error": "success",
"data": {
"card_list": [
{"cardInfo": "related_user", "cardStyle": "related_user", "data": {}},
{
"cardInfo": "thread",
"cardStyle": "thread",
"data": {
"tid": "10559655942",
"title": "数,英,编程老师",
"content": "培训班需求,数学,英语,编程老师,专职兼职都可",
"time": 1773552643,
"user": {
"show_nickname": "754023117",
"portrait": "https://example.com/avatar.jpg",
},
"post_num": 19,
"forum_name": "诸城",
},
},
]
},
}
notes = TieBaExtractor().extract_search_note_list_from_api(api_data)
assert len(notes) == 1
assert notes[0].note_id == "10559655942"
assert notes[0].title == "数,英,编程老师"
assert notes[0].tieba_name == "诸城吧"
assert notes[0].total_replay_num == 19
assert notes[0].publish_time
def test_extract_note_detail_and_comments_from_current_pc_api():
api_data = {
"error_code": 0,
"thread": {
"id": 10451142633,
"title": "这X尔斯对比巴尔斯我只能说ID正确允许居功自傲",
"reply_num": 15,
"create_time": 1769951446,
},
"forum": {"id": 1627732, "name": "dota2"},
"page": {"total_page": 1},
"first_floor": {
"id": 153154064746,
"author_id": 4089186644,
"time": 1769951446,
"content": [{"type": 0, "text": "皮队败决处刑德国编程钢琴师兼职数学家"}],
},
"post_list": [
{
"id": 153154097267,
"author_id": 6614897968,
"time": 1769952062,
"content": [{"type": 0, "text": "xg现在大树阵容另一个辅助不选控制"}],
"sub_post_number": 4,
}
],
"user_list": [
{
"id": 4089186644,
"name_show": "泰高祖蒙斯克",
"portrait": "tb.1.f893a7af",
"ip_address": "广东",
},
{
"id": 6614897968,
"name_show": "期胡希3",
"portrait": "tb.1.4d0471d4",
"ip_address": "河北",
},
],
}
extractor = TieBaExtractor()
note = extractor.extract_note_detail_from_api(api_data)
comments = extractor.extract_tieba_note_parent_comments_from_api(api_data, note)
assert note.note_id == "10451142633"
assert note.title == "这X尔斯对比巴尔斯我只能说ID正确允许居功自傲"
assert note.desc == "皮队败决处刑德国编程钢琴师兼职数学家"
assert note.user_nickname == "泰高祖蒙斯克"
assert note.tieba_name == "dota2吧"
assert note.total_replay_num == 15
assert note.total_replay_page == 1
assert note.ip_location == "广东"
assert len(comments) == 1
assert comments[0].comment_id == "153154097267"
assert comments[0].content == "xg现在大树阵容另一个辅助不选控制"
assert comments[0].user_nickname == "期胡希3"
assert comments[0].sub_comment_count == 4
assert comments[0].ip_location == "河北"
def test_extract_creator_info_and_threads_from_current_pc_api():
creator_api = {
"error_code": 0,
"data": {
"user": {
"id": 3546493137,
"name": "拜月教Alice",
"name_show": "米米世界大手子",
"portrait": "tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA?t=1777543466",
"fans_num": 58,
"concern_num": 1,
"sex": 1,
"tb_age": "7.8",
"ip_address": "广东",
}
},
}
feed_api = {
"error_code": 0,
"data": {
"list": [
{"type": 1, "thread_info": {"id": 10208192951, "tid": 10208192951}},
{"type": 1, "thread_info": {"id": 9835114923}},
]
},
}
extractor = TieBaExtractor()
creator = extractor.extract_creator_info_from_api(creator_api)
thread_ids = extractor.extract_creator_thread_id_list_from_api(feed_api)
assert creator.user_id == "3546493137"
assert creator.user_name == "拜月教Alice"
assert creator.nickname == "米米世界大手子"
assert creator.fans == 58
assert creator.follows == 1
assert creator.ip_location == "广东"
assert creator.registration_duration == "7.8"
assert thread_ids == ["10208192951", "9835114923"]
def test_extract_tieba_note_list_from_current_frs_api():
api_data = {
"error_code": 0,
"forum": {
"id": 351091,
"name": "加工中心",
"tids": "10376710029,10636556989,",
},
}
notes = TieBaExtractor().extract_tieba_note_list_from_frs_api(api_data)
assert [note.note_id for note in notes] == ["10376710029", "10636556989"]
assert notes[0].note_url == "https://tieba.baidu.com/p/10376710029"
assert notes[0].tieba_name == "加工中心吧"
assert notes[0].tieba_link.endswith("kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83")
def test_extract_tieba_note_list_from_bigpipe_thread_page():
notes = TieBaExtractor().extract_tieba_note_list(read_fixture("tieba_note_list.html"))
assert len(notes) == 48
assert notes[0].note_id == "9079949995"
assert notes[0].title == "盗墓笔记全集+txt小说已整理"
assert notes[0].user_nickname == "公子伯仲"
assert notes[0].tieba_name == "盗墓笔记吧"
assert notes[0].tieba_link.endswith("kw=%E7%9B%97%E5%A2%93%E7%AC%94%E8%AE%B0&ie=utf-8")
def test_extract_note_detail_from_post_page():
note = TieBaExtractor().extract_note_detail(read_fixture("note_detail.html"))
assert note.note_id == "9117905169"
assert note.title == "对于一个父亲来说这个女儿14岁就死了"
assert note.user_nickname == "章景轩"
assert note.tieba_name == "以太比特吧"
assert note.total_replay_num == 786
assert note.total_replay_page == 13
assert note.ip_location == "广东"
def test_extract_parent_comments_from_post_page():
comments = TieBaExtractor().extract_tieba_note_parment_comments(
read_fixture("note_comments.html"),
"9119688421",
)
assert len(comments) == 30
assert comments[0].comment_id == "150726491368"
assert comments[0].content == "中国队第22金无悬念"
assert comments[0].user_nickname == "heinzfrentzen"
assert comments[0].tieba_name == "网球风云吧"
assert comments[0].ip_location == "福建"
def test_extract_sub_comments_with_class_token_matching():
parent = TiebaComment(
comment_id="150726496253",
content="parent",
note_id="9119688421",
note_url="https://tieba.baidu.com/p/9119688421",
tieba_id="4513750",
tieba_name="网球风云吧",
tieba_link="https://tieba.baidu.com/f?kw=%E7%BD%91%E7%90%83%E9%A3%8E%E4%BA%91",
)
comments = TieBaExtractor().extract_tieba_note_sub_comments(
read_fixture("note_sub_comments.html"),
parent,
)
assert len(comments) >= 10
assert comments[0].comment_id
assert comments[0].parent_comment_id == parent.comment_id
assert comments[0].user_link.startswith("https://tieba.baidu.com/home/main")