mirror of https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-10 20:47:39 +08:00
fix: restore Tieba crawling after PC page rewrite
Tieba search, detail, comments, creator, and forum-list pages now rely on the current signed PC JSON APIs instead of brittle HTML selectors. The CLI also maps Tieba detail and creator arguments into the platform-specific config so command-line runs exercise the intended mode.

Constraint: Tieba PC pages no longer expose stable HTML structures for search, creator, and forum-list extraction
Constraint: Current PC APIs require browser cookies, tbs, and the web client signing convention
Rejected: Keep expanding HTML selectors | search and creator pages returned large documents with empty parsed results after the redesign
Confidence: high
Scope-risk: moderate
Directive: Do not replace these API paths with page HTML parsing without re-verifying the current Tieba network requests
Tested: uv run pytest tests/test_tieba_client_pagination.py tests/test_cmd_arg_tieba.py tests/test_tieba_extractor.py -q
Tested: uv run python -m py_compile cmd_arg/arg.py media_platform/tieba/help.py media_platform/tieba/client.py media_platform/tieba/core.py tests/test_cmd_arg_tieba.py tests/test_tieba_client_pagination.py tests/test_tieba_extractor.py
Tested: uv run main.py --platform tieba --type search --keywords 编程兼职 --get_comment false
Tested: uv run main.py --platform tieba --type detail --specified_id 9835114923 --get_comment true --max_comments_count_singlenotes 3
Tested: uv run main.py --platform tieba --type creator --creator_id https://tieba.baidu.com/home/main?id=tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA --get_comment false
Not-tested: Second-level Tieba comment API migration; this path still uses the existing /p/comment HTML parser
Not-tested: Full pytest suite has one pre-existing unrelated XHS Excel store assertion failure
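Note: the "web client signing convention" named in the constraints is the md5-over-sorted-params scheme that _sign_pc_params implements in the client diff below. A minimal standalone sketch (the secret is the value this commit ships; the params shown are illustrative):

    import hashlib

    PC_SIGN_SECRET = "36770b1f34c9bbf2e7d1a99d2b82fa9e"

    def sign_pc_params(params: dict) -> str:
        # key=value pairs concatenated in sorted key order, skipping existing
        # signature fields and None values, then the secret appended and md5'd.
        sign_text = "".join(
            f"{key}={params[key]}"
            for key in sorted(params)
            if key not in {"sign", "sig"} and params[key] is not None
        )
        return hashlib.md5((sign_text + PC_SIGN_SECRET).encode("utf-8")).hexdigest()

    params = {"subapp_type": "pc", "_client_type": "20"}
    params["sign"] = sign_pc_params(params)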
cmd_arg/arg.py
@@ -22,6 +22,7 @@ from __future__ import annotations
 
 import sys
+import re
 from enum import Enum
 from types import SimpleNamespace
 from typing import Iterable, Optional, Sequence, Type, TypeVar
@@ -135,6 +136,21 @@ def _inject_init_db_default(args: Sequence[str]) -> list[str]:
     return normalized
 
 
+def _normalize_tieba_note_id(value: str) -> str:
+    """Accept a raw Tieba thread id or a /p/<id> URL."""
+    value = value.strip()
+    match = re.search(r"/p/(\d+)", value)
+    return match.group(1) if match else value
+
+
+def _normalize_tieba_creator_url(value: str) -> str:
+    """Accept a Tieba creator homepage URL or a portrait id."""
+    value = value.strip()
+    if value.startswith("http://") or value.startswith("https://"):
+        return value
+    return f"https://tieba.baidu.com/home/main?id={value}"
+
+
 async def parse_cmd(argv: Optional[Sequence[str]] = None):
     """Parse command line arguments using Typer."""
 
@@ -344,6 +360,10 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
             config.WEIBO_SPECIFIED_ID_LIST = specified_id_list
         elif platform == PlatformEnum.KUAISHOU:
             config.KS_SPECIFIED_ID_LIST = specified_id_list
+        elif platform == PlatformEnum.TIEBA:
+            config.TIEBA_SPECIFIED_ID_LIST = [
+                _normalize_tieba_note_id(item) for item in specified_id_list
+            ]
 
     if creator_id_list:
         if platform == PlatformEnum.XHS:
@@ -356,6 +376,10 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
             config.WEIBO_CREATOR_ID_LIST = creator_id_list
         elif platform == PlatformEnum.KUAISHOU:
             config.KS_CREATOR_ID_LIST = creator_id_list
+        elif platform == PlatformEnum.TIEBA:
+            config.TIEBA_CREATOR_URL_LIST = [
+                _normalize_tieba_creator_url(item) for item in creator_id_list
+            ]
 
     return SimpleNamespace(
         platform=config.PLATFORM,
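Note: behavior of the two new normalizers, mirrored from the CLI tests added later in this commit (the ids and portrait strings are the test fixtures, nothing special):

    assert _normalize_tieba_note_id("https://tieba.baidu.com/p/10451142633") == "10451142633"
    assert _normalize_tieba_note_id("9835114923") == "9835114923"
    assert _normalize_tieba_creator_url("tb.1.example") == "https://tieba.baidu.com/home/main?id=tb.1.example"
    assert _normalize_tieba_creator_url("https://tieba.baidu.com/home/main?id=tb.1.raw") == "https://tieba.baidu.com/home/main?id=tb.1.raw"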
media_platform/tieba/client.py
@@ -18,9 +18,10 @@
 # Using this code indicates your agreement to the above principles and all terms in the LICENSE.
 
 import asyncio
+import hashlib
 import json
 from typing import Any, Callable, Dict, List, Optional, Union
-from urllib.parse import urlencode, quote
+from urllib.parse import urlencode, quote, parse_qs, unquote, urlparse
 
 import requests
 from playwright.async_api import BrowserContext, Page
@@ -35,6 +36,8 @@ from tools import utils
 from .field import SearchNoteType, SearchSortType
 from .help import TieBaExtractor
 
+PC_SIGN_SECRET = "36770b1f34c9bbf2e7d1a99d2b82fa9e"
+
 
 class BaiduTieBaClient(AbstractApiClient):
 
@@ -58,6 +61,128 @@ class BaiduTieBaClient(AbstractApiClient):
         self._page_extractor = TieBaExtractor()
         self.default_ip_proxy = default_ip_proxy
         self.playwright_page = playwright_page  # Playwright page object
+        self._pc_tbs = ""
+
+    @staticmethod
+    def _sign_pc_params(params: Dict[str, Any]) -> str:
+        sign_text = ""
+        for key in sorted(params):
+            if key in {"sign", "sig"} or params[key] is None:
+                continue
+            sign_text += f"{key}={params[key]}"
+        sign_text += PC_SIGN_SECRET
+        return hashlib.md5(sign_text.encode("utf-8")).hexdigest()
+
+    async def _ensure_tieba_origin(self) -> None:
+        if not self.playwright_page:
+            raise Exception("playwright_page is required for tieba PC API requests")
+        if not self.playwright_page.url.startswith(self._host):
+            await self.playwright_page.goto(self._host, wait_until="domcontentloaded")
+
+    async def _fetch_json_by_browser(
+        self,
+        uri: str,
+        method: str = "GET",
+        params: Optional[Dict[str, Any]] = None,
+        data: Optional[Dict[str, Any]] = None,
+        use_sign: bool = False,
+    ) -> Dict:
+        """
+        Fetch current Tieba PC JSON APIs from the browser context.
+        These APIs rely on logged-in browser cookies and Baidu's PC signing
+        convention, while Python requests can be blocked by local proxy/TLS.
+        """
+        await self._ensure_tieba_origin()
+        params = {k: v for k, v in (params or {}).items() if v is not None}
+        data = {k: v for k, v in (data or {}).items() if v is not None}
+        if use_sign:
+            sign_source = data if method.upper() == "POST" else params
+            sign_source.setdefault("subapp_type", "pc")
+            sign_source.setdefault("_client_type", "20")
+            sign_source["sign"] = self._sign_pc_params(sign_source)
+
+        url = f"{self._host}{uri}"
+        if params:
+            url = f"{url}?{urlencode(params)}"
+        body = urlencode(data) if data else ""
+        response = await self.playwright_page.evaluate(
+            """async ({ url, method, body }) => {
+                const headers = { "Accept": "application/json, text/plain, */*" };
+                const options = { method, credentials: "include", headers };
+                if (method === "POST") {
+                    headers["Content-Type"] = "application/x-www-form-urlencoded;charset=UTF-8";
+                    options.body = body;
+                }
+                const resp = await fetch(url, options);
+                const text = await resp.text();
+                return { status: resp.status, text };
+            }""",
+            {"url": url, "method": method.upper(), "body": body},
+        )
+        if response["status"] != 200:
+            raise Exception(f"Tieba PC API failed, status={response['status']}, url={url}")
+        try:
+            json_data = json.loads(response["text"])
+        except json.JSONDecodeError as exc:
+            raise Exception(f"Tieba PC API returned non-JSON, url={url}, body={response['text'][:500]}") from exc
+        error_code = json_data.get("error_code", json_data.get("no", 0))
+        if str(error_code) not in {"0", "None"}:
+            raise Exception(f"Tieba PC API error, url={url}, response={json_data}")
+        return json_data
+
+    async def _get_pc_tbs(self) -> str:
+        if self._pc_tbs:
+            return self._pc_tbs
+        sync_data = await self._fetch_json_by_browser(
+            "/c/s/pc/sync",
+            params={"subapp_type": "pc", "_client_type": "20"},
+            use_sign=True,
+        )
+        self._pc_tbs = (
+            sync_data.get("data", {})
+            .get("anti", {})
+            .get("tbs", "")
+        )
+        if not self._pc_tbs:
+            raise Exception(f"Can not get Tieba tbs from pc sync API: {sync_data}")
+        return self._pc_tbs
+
+    async def _get_pc_page_data(self, note_id: str, page: int = 1) -> Dict:
+        tbs = await self._get_pc_tbs()
+        return await self._fetch_json_by_browser(
+            "/c/f/pb/page_pc",
+            method="POST",
+            data={
+                "pn": page,
+                "lz": 0,
+                "r": 2,
+                "mark_type": 0,
+                "back": 0,
+                "fr": "",
+                "kz": note_id,
+                "session_request_times": 1,
+                "tbs": tbs,
+                "subapp_type": "pc",
+                "_client_type": "20",
+            },
+            use_sign=True,
+        )
+
+    @staticmethod
+    def _extract_creator_portrait(creator_url: str) -> str:
+        creator_url = (creator_url or "").strip()
+        if not creator_url:
+            return ""
+        if not creator_url.startswith(("http://", "https://")):
+            return creator_url.split("?")[0]
+        parsed = urlparse(creator_url)
+        query = parse_qs(parsed.query)
+        portrait = (
+            query.get("id", [""])[0]
+            or query.get("portrait", [""])[0]
+            or query.get("un", [""])[0]
+        )
+        return unquote(portrait).split("?")[0]
 
     def _sync_request(self, method, url, proxy=None, **kwargs):
         """
@@ -270,35 +395,29 @@ class BaiduTieBaClient(AbstractApiClient):
             utils.logger.error("[BaiduTieBaClient.get_notes_by_keyword] playwright_page is None, cannot use browser mode")
             raise Exception("playwright_page is required for browser-based search")
 
-        # Construct search URL
-        # Example: https://tieba.baidu.com/f/search/res?ie=utf-8&qw=keyword
-        search_url = f"{self._host}/f/search/res"
         params = {
             "ie": "utf-8",
             "qw": keyword,
-            "rn": page_size,
+            "rn": max(page_size, 20),
             "st": sort.value,
             "word": keyword,
             "needbrand": 1,
             "sug_type": 2,
             "pn": page,
             "sm": sort.value,
             "only_thread": note_type.value,
             "come_from": "search",
             "subapp_type": "pc",
             "_client_type": "20",
         }
 
-        # Concatenate full URL
-        full_url = f"{search_url}?{urlencode(params)}"
-        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Accessing search page: {full_url}")
+        utils.logger.info(
+            f"[BaiduTieBaClient.get_notes_by_keyword] Accessing search API: "
+            f"{self._host}/mo/q/search/multsearch?{urlencode(params)}"
+        )
 
         try:
-            # Use Playwright to access search page
-            await self.playwright_page.goto(full_url, wait_until="domcontentloaded")
-            # Wait for page loading, using delay setting from config file
-            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
-            # Get page HTML content
-            page_content = await self.playwright_page.content()
-            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Successfully retrieved search page HTML, length: {len(page_content)}")
-            # Extract search results
-            notes = self._page_extractor.extract_search_note_list(page_content)
+            api_data = await self._fetch_json_by_browser(
+                "/mo/q/search/multsearch",
+                params=params,
+                use_sign=True,
+            )
+            notes = self._page_extractor.extract_search_note_list_from_api(api_data)[:page_size]
             utils.logger.info(f"[BaiduTieBaClient.get_notes_by_keyword] Extracted {len(notes)} posts")
             return notes
 
@@ -319,23 +438,11 @@ class BaiduTieBaClient(AbstractApiClient):
             utils.logger.error("[BaiduTieBaClient.get_note_by_id] playwright_page is None, cannot use browser mode")
             raise Exception("playwright_page is required for browser-based note detail fetching")
 
-        # Construct post detail URL
-        note_url = f"{self._host}/p/{note_id}"
-        utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Accessing post detail page: {note_url}")
+        utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Accessing post detail API, note_id: {note_id}")
 
         try:
-            # Use Playwright to access post detail page
-            await self.playwright_page.goto(note_url, wait_until="domcontentloaded")
-            # Wait for page loading, using delay setting from config file
-            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
-            # Get page HTML content
-            page_content = await self.playwright_page.content()
-            utils.logger.info(f"[BaiduTieBaClient.get_note_by_id] Successfully retrieved post detail HTML, length: {len(page_content)}")
-            # Extract post details
-            note_detail = self._page_extractor.extract_note_detail(page_content)
+            api_data = await self._get_pc_page_data(note_id=note_id, page=1)
+            note_detail = self._page_extractor.extract_note_detail_from_api(api_data)
             return note_detail
 
         except Exception as e:
@@ -367,23 +474,15 @@ class BaiduTieBaClient(AbstractApiClient):
         current_page = 1
 
         while note_detail.total_replay_page >= current_page and len(result) < max_count:
-            # Construct comment page URL
-            comment_url = f"{self._host}/p/{note_detail.note_id}?pn={current_page}"
-            utils.logger.info(f"[BaiduTieBaClient.get_note_all_comments] Accessing comment page: {comment_url}")
+            utils.logger.info(
+                f"[BaiduTieBaClient.get_note_all_comments] Accessing comment API, "
+                f"note_id: {note_detail.note_id}, page: {current_page}"
+            )
 
             try:
-                # Use Playwright to access comment page
-                await self.playwright_page.goto(comment_url, wait_until="domcontentloaded")
-                # Wait for page loading, using delay setting from config file
-                await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
-                # Get page HTML content
-                page_content = await self.playwright_page.content()
-                # Extract comments
-                comments = self._page_extractor.extract_tieba_note_parment_comments(
-                    page_content, note_id=note_detail.note_id
+                api_data = await self._get_pc_page_data(note_id=note_detail.note_id, page=current_page)
+                comments = self._page_extractor.extract_tieba_note_parent_comments_from_api(
+                    api_data, note_detail=note_detail
                 )
 
                 if not comments:
@@ -498,7 +597,7 @@ class BaiduTieBaClient(AbstractApiClient):
 
     async def get_notes_by_tieba_name(self, tieba_name: str, page_num: int) -> List[TiebaNote]:
         """
-        Get post list by Tieba name (uses Playwright to access page, avoiding API detection)
+        Get post list by Tieba name from current PC forum JSON API.
         Args:
             tieba_name: Tieba name
             page_num: Page number
@@ -510,23 +609,33 @@ class BaiduTieBaClient(AbstractApiClient):
             utils.logger.error("[BaiduTieBaClient.get_notes_by_tieba_name] playwright_page is None, cannot use browser mode")
             raise Exception("playwright_page is required for browser-based tieba note fetching")
 
-        # Construct Tieba post list URL
-        tieba_url = f"{self._host}/f?kw={quote(tieba_name)}&pn={page_num}"
-        utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Accessing Tieba page: {tieba_url}")
+        page_size = 30
+        api_page = page_num // page_size + 1
+        tbs = await self._get_pc_tbs()
+        utils.logger.info(
+            f"[BaiduTieBaClient.get_notes_by_tieba_name] Accessing Tieba FRS API, "
+            f"tieba_name: {tieba_name}, page: {api_page}"
+        )
 
         try:
-            # Use Playwright to access Tieba page
-            await self.playwright_page.goto(tieba_url, wait_until="domcontentloaded")
-            # Wait for page loading, using delay setting from config file
-            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
-            # Get page HTML content
-            page_content = await self.playwright_page.content()
-            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Successfully retrieved Tieba page HTML, length: {len(page_content)}")
-            # Extract post list
-            notes = self._page_extractor.extract_tieba_note_list(page_content)
+            api_data = await self._fetch_json_by_browser(
+                "/c/f/frs/page_pc",
+                method="POST",
+                data={
+                    "kw": quote(tieba_name),
+                    "pn": api_page,
+                    "sort_type": -1,
+                    "is_newfrs": 1,
+                    "is_newfeed": 1,
+                    "rn": page_size,
+                    "rn_need": 10,
+                    "tbs": tbs,
+                    "subapp_type": "pc",
+                    "_client_type": "20",
+                },
+                use_sign=True,
+            )
+            notes = self._page_extractor.extract_tieba_note_list_from_frs_api(api_data)[:page_size]
            utils.logger.info(f"[BaiduTieBaClient.get_notes_by_tieba_name] Extracted {len(notes)} posts")
             return notes
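Note: get_notes_by_tieba_name still receives an offset-style page_num from the crawler (0, 30, 60, ... in steps of tieba_limit_count = 30, see the core.py hunk below), while the FRS API takes a 1-based page index, so the conversion is plain integer division:

    page_size = 30
    for page_num in (0, 30, 60):
        api_page = page_num // page_size + 1   # -> 1, 2, 3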
@@ -534,38 +643,72 @@ class BaiduTieBaClient(AbstractApiClient):
             utils.logger.error(f"[BaiduTieBaClient.get_notes_by_tieba_name] Failed to get Tieba post list: {e}")
             raise
 
-    async def get_creator_info_by_url(self, creator_url: str) -> str:
+    async def get_creator_info_by_url(self, creator_url: str) -> TiebaCreator:
         """
-        Get creator information by creator URL (uses Playwright to access page, avoiding API detection)
+        Get creator information by creator URL from current PC JSON API.
         Args:
             creator_url: Creator homepage URL
 
         Returns:
-            str: Page HTML content
+            TiebaCreator: Creator information
         """
         if not self.playwright_page:
             utils.logger.error("[BaiduTieBaClient.get_creator_info_by_url] playwright_page is None, cannot use browser mode")
             raise Exception("playwright_page is required for browser-based creator info fetching")
 
-        utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Accessing creator homepage: {creator_url}")
+        portrait = self._extract_creator_portrait(creator_url)
+        if not portrait:
+            raise Exception(f"Can not extract Tieba creator portrait from url: {creator_url}")
+
+        utils.logger.info(
+            f"[BaiduTieBaClient.get_creator_info_by_url] Accessing creator info API, portrait: {portrait}"
+        )
 
         try:
-            # Use Playwright to access creator homepage
-            await self.playwright_page.goto(creator_url, wait_until="domcontentloaded")
-            # Wait for page loading, using delay setting from config file
-            await asyncio.sleep(config.CRAWLER_MAX_SLEEP_SEC)
-            # Get page HTML content
-            page_content = await self.playwright_page.content()
-            utils.logger.info(f"[BaiduTieBaClient.get_creator_info_by_url] Successfully retrieved creator homepage HTML, length: {len(page_content)}")
-            return page_content
+            api_data = await self._fetch_json_by_browser(
+                "/c/u/pc/homeSidebarRight",
+                params={
+                    "portrait": portrait,
+                    "un": "",
+                    "subapp_type": "pc",
+                    "_client_type": "20",
+                },
+                use_sign=True,
+            )
+            return self._page_extractor.extract_creator_info_from_api(api_data)
 
         except Exception as e:
-            utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] Failed to get creator homepage: {e}")
+            utils.logger.error(f"[BaiduTieBaClient.get_creator_info_by_url] Failed to get creator info: {e}")
             raise
 
+    async def get_notes_by_creator_portrait(
+        self, portrait: str, page_number: int, page_size: int = 20
+    ) -> Dict:
+        """
+        Get creator's thread feed by creator portrait from current PC JSON API.
+        """
+        if not self.playwright_page:
+            utils.logger.error("[BaiduTieBaClient.get_notes_by_creator_portrait] playwright_page is None, cannot use browser mode")
+            raise Exception("playwright_page is required for browser-based creator notes fetching")
+
+        utils.logger.info(
+            f"[BaiduTieBaClient.get_notes_by_creator_portrait] Accessing creator feed API, "
+            f"portrait: {portrait}, page: {page_number}"
+        )
+        return await self._fetch_json_by_browser(
+            "/c/u/feed/myThread",
+            params={
+                "pn": page_number,
+                "rn": page_size,
+                "portrait": portrait,
+                "type": 1,
+                "un": "",
+                "subapp_type": "pc",
+                "_client_type": "20",
+            },
+            use_sign=True,
+        )
+
     async def get_notes_by_creator(self, user_name: str, page_number: int) -> Dict:
         """
         Get creator's posts by creator (uses Playwright to access page, avoiding API detection)
@@ -648,12 +791,12 @@ class BaiduTieBaClient(AbstractApiClient):
         while notes_has_more == 1 and (max_note_count == 0 or total_get_count < max_note_count):
             notes_res = await self.get_notes_by_creator(user_name, page_number)
             if not notes_res or notes_res.get("no") != 0:
-                utils.logger.error(f"[WeiboClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}")
+                utils.logger.error(f"[TieBaClient.get_notes_by_creator] got user_name:{user_name} notes failed, notes_res: {notes_res}")
                 break
             notes_data = notes_res.get("data")
             notes_has_more = notes_data.get("has_more")
             notes = notes_data["thread_list"]
-            utils.logger.info(f"[WeiboClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}")
+            utils.logger.info(f"[TieBaClient.get_all_notes_by_creator] got user_name:{user_name} notes len : {len(notes)}")
 
             note_detail_task = [self.get_note_by_id(note['thread_id']) for note in notes]
             notes = await asyncio.gather(*note_detail_task)
@@ -664,3 +807,59 @@ class BaiduTieBaClient(AbstractApiClient):
             page_number += 1
             total_get_count += page_per_count
         return result
+
+    async def get_all_notes_by_creator_url(
+        self,
+        creator_url: str,
+        crawl_interval: float = 1.0,
+        callback: Optional[Callable] = None,
+        max_note_count: int = 0,
+    ) -> List[TiebaNote]:
+        """
+        Get all creator posts by current PC creator feed API.
+        """
+        portrait = self._extract_creator_portrait(creator_url)
+        if not portrait:
+            raise Exception(f"Can not extract Tieba creator portrait from url: {creator_url}")
+
+        result: List[TiebaNote] = []
+        page_number = 1
+        page_size = 20
+
+        while max_note_count == 0 or len(result) < max_note_count:
+            notes_res = await self.get_notes_by_creator_portrait(
+                portrait=portrait,
+                page_number=page_number,
+                page_size=page_size,
+            )
+            thread_id_list = self._page_extractor.extract_creator_thread_id_list_from_api(notes_res)
+            if not thread_id_list:
+                utils.logger.info(
+                    f"[BaiduTieBaClient.get_all_notes_by_creator_url] "
+                    f"Creator portrait:{portrait} page:{page_number} has no threads"
+                )
+                break
+
+            if max_note_count:
+                thread_id_list = thread_id_list[: max_note_count - len(result)]
+
+            utils.logger.info(
+                f"[BaiduTieBaClient.get_all_notes_by_creator_url] "
+                f"got portrait:{portrait} thread ids len: {len(thread_id_list)}"
+            )
+            note_detail_task = [self.get_note_by_id(thread_id) for thread_id in thread_id_list]
+            notes = await asyncio.gather(*note_detail_task)
+            notes = [note for note in notes if note]
+            if callback and notes:
+                await callback(notes)
+            result.extend(notes)
+
+            data = notes_res.get("data", {})
+            has_more = int(data.get("has_more") or 0)
+            if not has_more:
+                break
+
+            await asyncio.sleep(crawl_interval)
+            page_number += 1
+
+        return result
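Note: a hypothetical end-to-end call sequence for the new detail path, showing where the cached tbs token enters (method names come from the diff above; _get_pc_page_data fetches tbs itself, the explicit call just illustrates the caching; the note id is the one from the tested detail command):

    tbs = await client._get_pc_tbs()     # fetched once from /c/s/pc/sync, then cached on the client
    api_data = await client._get_pc_page_data(note_id="9835114923", page=1)
    note = client._page_extractor.extract_note_detail_from_api(api_data)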
media_platform/tieba/core.py
@@ -213,7 +213,7 @@ class TieBaCrawler(AbstractCrawler):
         Returns:
 
         """
-        tieba_limit_count = 50
+        tieba_limit_count = 30
         if config.CRAWLER_MAX_NOTES_COUNT < tieba_limit_count:
             config.CRAWLER_MAX_NOTES_COUNT = tieba_limit_count
         for tieba_name in config.TIEBA_NAME_LIST:
@@ -245,7 +245,7 @@ class TieBaCrawler(AbstractCrawler):
             page_number += tieba_limit_count
 
     async def get_specified_notes(
-        self, note_id_list: List[str] = config.TIEBA_SPECIFIED_ID_LIST
+        self, note_id_list: Optional[List[str]] = None
     ):
         """
         Get the information and comments of the specified post
@@ -255,6 +255,8 @@ class TieBaCrawler(AbstractCrawler):
         Returns:
 
         """
+        if note_id_list is None:
+            note_id_list = config.TIEBA_SPECIFIED_ID_LIST
         semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
         task_list = [
             self.get_note_detail_async_task(note_id=note_id, semaphore=semaphore)
@@ -365,18 +367,15 @@ class TieBaCrawler(AbstractCrawler):
 
         """
         utils.logger.info(
-            "[WeiboCrawler.get_creators_and_notes] Begin get weibo creators"
+            "[TieBaCrawler.get_creators_and_notes] Begin get tieba creators"
         )
         for creator_url in config.TIEBA_CREATOR_URL_LIST:
-            creator_page_html_content = await self.tieba_client.get_creator_info_by_url(
+            creator_info: TiebaCreator = await self.tieba_client.get_creator_info_by_url(
                 creator_url=creator_url
             )
-            creator_info: TiebaCreator = self._page_extractor.extract_creator_info(
-                creator_page_html_content
-            )
             if creator_info:
                 utils.logger.info(
-                    f"[WeiboCrawler.get_creators_and_notes] creator info: {creator_info}"
+                    f"[TieBaCrawler.get_creators_and_notes] creator info: {creator_info}"
                 )
             if not creator_info:
                 raise Exception("Get creator info error")
@@ -385,12 +384,11 @@ class TieBaCrawler(AbstractCrawler):
 
             # Get all note information of the creator
             all_notes_list = (
-                await self.tieba_client.get_all_notes_by_creator_user_name(
-                    user_name=creator_info.user_name,
+                await self.tieba_client.get_all_notes_by_creator_url(
+                    creator_url=creator_url,
                     crawl_interval=0,
                     callback=tieba_store.batch_update_tieba_notes,
                     max_note_count=config.CRAWLER_MAX_NOTES_COUNT,
-                    creator_page_html_content=creator_page_html_content,
                 )
             )
 
@@ -398,7 +396,7 @@ class TieBaCrawler(AbstractCrawler):
 
             else:
                 utils.logger.error(
-                    f"[WeiboCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
+                    f"[TieBaCrawler.get_creators_and_notes] get creator info error, creator_url:{creator_url}"
                )
 
     async def _navigate_to_tieba_via_baidu(self):
media_platform/tieba/help.py
@@ -22,8 +22,8 @@
 import html
 import json
 import re
-from typing import Dict, List, Tuple
-from urllib.parse import parse_qs, unquote
+from typing import Any, Dict, List, Tuple
+from urllib.parse import parse_qs, quote, unquote, urljoin
 
 from parsel import Selector
 
@@ -39,6 +39,306 @@ class TieBaExtractor:
     def __init__(self):
         pass
 
+    @staticmethod
+    def _class_contains(class_name: str) -> str:
+        return f"contains(concat(' ', normalize-space(@class), ' '), ' {class_name} ')"
+
+    @staticmethod
+    def _normalize_text(text: str) -> str:
+        return re.sub(r"\s+", " ", text or "").strip()
+
+    @classmethod
+    def _selector_text(cls, selector: Selector, xpath: str) -> str:
+        node = selector.xpath(xpath)
+        if not node:
+            return ""
+        return cls._normalize_text(node[0].xpath("string(.)").get(default=""))
+
+    @staticmethod
+    def _absolute_url(url: str) -> str:
+        return urljoin(const.TIEBA_URL, (url or "").strip())
+
+    @staticmethod
+    def _extract_note_id_from_url(url: str) -> str:
+        note_id_match = re.search(r"/p/(\d+)", url or "")
+        return note_id_match.group(1) if note_id_match else ""
+
+    @staticmethod
+    def _text_to_int(text: str) -> int:
+        match = re.search(r"\d+", text or "")
+        return int(match.group(0)) if match else 0
+
+    @staticmethod
+    def _ensure_tieba_suffix(tieba_name: str) -> str:
+        tieba_name = (tieba_name or "").strip()
+        return tieba_name if not tieba_name or tieba_name.endswith("吧") else f"{tieba_name}吧"
+
+    @classmethod
+    def _tieba_link_from_name(cls, tieba_name: str) -> str:
+        if not tieba_name:
+            return const.TIEBA_URL
+        return f"{const.TIEBA_URL}/f?kw={quote(tieba_name.removesuffix('吧'))}"
+
+    @classmethod
+    def _extract_api_content_text(cls, content: Any) -> str:
+        if isinstance(content, str):
+            return cls._normalize_text(content)
+        if not isinstance(content, list):
+            return ""
+        text_list: List[str] = []
+        for item in content:
+            if not isinstance(item, dict):
+                continue
+            text = item.get("text") or item.get("c") or ""
+            if text:
+                text_list.append(str(text))
+        return cls._normalize_text("".join(text_list))
+
+    @staticmethod
+    def _api_user_map(api_data: Dict) -> Dict[str, Dict]:
+        return {str(user.get("id")): user for user in api_data.get("user_list", []) if user.get("id")}
+
+    @staticmethod
+    def _api_user_link(user: Dict) -> str:
+        portrait = (user or {}).get("portrait", "")
+        if not portrait:
+            return ""
+        return f"{const.TIEBA_URL}/home/main?id={quote(str(portrait))}"
+
+    @staticmethod
+    def _api_user_avatar(user: Dict) -> str:
+        image_data = (
+            (user or {})
+            .get("user_show_info", {})
+            .get("feed_head", {})
+            .get("image_data", {})
+        )
+        return image_data.get("img_url") or (
+            "https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/"
+            f"{user.get('portrait', '')}"
+            if user and user.get("portrait")
+            else ""
+        )
+
+    def extract_search_note_list_from_api(self, api_data: Dict) -> List[TiebaNote]:
+        """
+        Extract Tieba post list from current PC search JSON API.
+        """
+        result: List[TiebaNote] = []
+        cards = api_data.get("data", {}).get("card_list", [])
+        for card in cards:
+            if card.get("cardInfo") != "thread" and card.get("cardStyle") != "thread":
+                continue
+            item = card.get("data") or {}
+            note_id = str(item.get("tid") or "")
+            if not note_id:
+                continue
+            user = item.get("user") or {}
+            tieba_name = self._ensure_tieba_suffix(item.get("forum_name") or "")
+            tieba_note = TiebaNote(
+                note_id=note_id,
+                title=self._normalize_text(item.get("title") or ""),
+                desc=self._normalize_text(item.get("content") or ""),
+                note_url=f"{const.TIEBA_URL}/p/{note_id}",
+                publish_time=utils.get_time_str_from_unix_time(
+                    item.get("time") or item.get("create_time") or 0
+                ),
+                user_link="",
+                user_nickname=user.get("show_nickname") or user.get("user_name") or "",
+                user_avatar=user.get("portrait") or user.get("portraith") or "",
+                tieba_name=tieba_name,
+                tieba_link=self._tieba_link_from_name(tieba_name),
+                total_replay_num=item.get("post_num") or 0,
+            )
+            result.append(tieba_note)
+        return result
+
+    def extract_note_detail_from_api(self, api_data: Dict) -> TiebaNote:
+        """
+        Extract Tieba post detail from current PC page_pc JSON API.
+        """
+        thread = api_data.get("thread") or {}
+        first_floor = api_data.get("first_floor") or {}
+        forum = api_data.get("forum") or api_data.get("display_forum") or {}
+        page = api_data.get("page") or {}
+        user_map = self._api_user_map(api_data)
+        author = user_map.get(str(first_floor.get("author_id"))) or {}
+        note_id = str(thread.get("id") or thread.get("tid") or first_floor.get("tid") or "")
+        tieba_name = self._ensure_tieba_suffix(forum.get("name") or "")
+        note = TiebaNote(
+            note_id=note_id,
+            title=self._clean_title(thread.get("title") or first_floor.get("title") or "", tieba_name),
+            desc=self._extract_api_content_text(
+                first_floor.get("content")
+                or thread.get("origin_thread_info", {}).get("abstract")
+                or thread.get("origin_thread_info", {}).get("content")
+            ),
+            note_url=f"{const.TIEBA_URL}/p/{note_id}",
+            publish_time=utils.get_time_str_from_unix_time(
+                first_floor.get("time") or thread.get("create_time") or 0
+            ),
+            user_link=self._api_user_link(author),
+            user_nickname=author.get("name_show") or author.get("name") or "",
+            user_avatar=self._api_user_avatar(author),
+            tieba_name=tieba_name,
+            tieba_link=self._tieba_link_from_name(tieba_name),
+            total_replay_num=thread.get("reply_num") or 0,
+            total_replay_page=page.get("total_page") or 0,
+            ip_location=author.get("ip_address") or "",
+        )
+        return note
+
+    def extract_tieba_note_parent_comments_from_api(
+        self, api_data: Dict, note_detail: TiebaNote
+    ) -> List[TiebaComment]:
+        """
+        Extract first-level comments from current PC page_pc JSON API.
+        """
+        forum = api_data.get("forum") or api_data.get("display_forum") or {}
+        tieba_id = str(forum.get("id") or "")
+        tieba_name = note_detail.tieba_name or self._ensure_tieba_suffix(forum.get("name") or "")
+        tieba_link = note_detail.tieba_link or self._tieba_link_from_name(tieba_name)
+        user_map = self._api_user_map(api_data)
+        result: List[TiebaComment] = []
+        for item in api_data.get("post_list", []):
+            comment_id = str(item.get("id") or "")
+            if not comment_id:
+                continue
+            user = user_map.get(str(item.get("author_id"))) or {}
+            comment = TiebaComment(
+                comment_id=comment_id,
+                sub_comment_count=item.get("sub_post_number") or 0,
+                content=self._extract_api_content_text(item.get("content")),
+                note_url=note_detail.note_url,
+                user_link=self._api_user_link(user),
+                user_nickname=user.get("name_show") or user.get("name") or "",
+                user_avatar=self._api_user_avatar(user),
+                tieba_id=tieba_id,
+                tieba_name=tieba_name,
+                tieba_link=tieba_link,
+                ip_location=user.get("ip_address") or "",
+                publish_time=utils.get_time_str_from_unix_time(item.get("time") or 0),
+                note_id=note_detail.note_id,
+            )
+            result.append(comment)
+        return result
+
+    def extract_creator_info_from_api(self, api_data: Dict) -> TiebaCreator:
+        """
+        Extract Tieba creator information from current PC creator JSON API.
+        """
+        user = api_data.get("data", {}).get("user", {})
+        if not user:
+            raise ValueError(f"Creator API response does not contain user info: {api_data}")
+        gender_value = user.get("sex", user.get("gender", 0))
+        gender = "Unknown"
+        if gender_value == 1:
+            gender = "Male"
+        elif gender_value == 2:
+            gender = "Female"
+
+        return TiebaCreator(
+            user_id=str(user.get("id", "")),
+            user_name=str(user.get("name", "")),
+            nickname=str(user.get("name_show") or user.get("name") or ""),
+            avatar=self._api_user_avatar(user),
+            gender=gender,
+            ip_location=str(user.get("ip_address", "")),
+            follows=int(user.get("concern_num") or 0),
+            fans=int(user.get("fans_num") or 0),
+            registration_duration=str(user.get("tb_age", "")),
+        )
+
+    @staticmethod
+    def extract_creator_thread_id_list_from_api(api_data: Dict) -> List[str]:
+        """
+        Extract creator thread ids from current PC creator feed JSON API.
+        """
+        thread_ids: List[str] = []
+        for item in api_data.get("data", {}).get("list", []):
+            thread_info = item.get("thread_info") or {}
+            thread_id = thread_info.get("tid") or thread_info.get("id")
+            if thread_id:
+                thread_ids.append(str(thread_id))
+        return thread_ids
+
+    def extract_tieba_note_list_from_frs_api(self, api_data: Dict) -> List[TiebaNote]:
+        """
+        Extract Tieba thread ids from current PC forum page JSON API.
+
+        The by-forum command immediately fetches full details for every id, so
+        this list intentionally carries only stable routing fields.
+        """
+        forum = api_data.get("forum", {})
+        tieba_name = self._ensure_tieba_suffix(forum.get("name") or "")
+        tieba_link = self._tieba_link_from_name(tieba_name)
+        tids = [
+            tid.strip()
+            for tid in str(forum.get("tids") or "").split(",")
+            if tid.strip()
+        ]
+        return [
+            TiebaNote(
+                note_id=tid,
+                title="",
+                desc="",
+                note_url=f"{const.TIEBA_URL}/p/{tid}",
+                tieba_name=tieba_name,
+                tieba_link=tieba_link,
+            )
+            for tid in tids
+        ]
+
+    @staticmethod
+    def _decode_js_string(value: str) -> str:
+        if not value or value == "null":
+            return ""
+        try:
+            decoded_value = json.loads(f'"{value}"')
+            return decoded_value if isinstance(decoded_value, str) else str(decoded_value)
+        except Exception:
+            return value
+
+    @classmethod
+    def _extract_forum_info(cls, selector: Selector, page_content: str) -> Tuple[str, str]:
+        forum_xpath = f"//a[{cls._class_contains('card_title_fname')}]"
+        forum_link_selector = selector.xpath(forum_xpath)
+        tieba_name = cls._selector_text(selector, forum_xpath)
+        tieba_link = cls._absolute_url(forum_link_selector.xpath("./@href").get(default=""))
+
+        if not tieba_name:
+            patterns = [
+                r"PageData\.forum\s*=\s*\{.*?['\"]name['\"]\s*:\s*\"([^\"\\\\]*(?:\\\\.[^\"\\\\]*)*)\"",
+                r'"forum_name"\s*:\s*"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"',
+                r'"kw"\s*:\s*"([^"\\\\]*(?:\\\\.[^"\\\\]*)*)"',
+            ]
+            for pattern in patterns:
+                match = re.search(pattern, page_content, re.S)
+                if match:
+                    tieba_name = cls._decode_js_string(match.group(1))
+                    if tieba_name:
+                        break
+
+        if not tieba_name:
+            title = selector.xpath("//title/text()").get(default="")
+            match = re.search(r"(.+?)吧[-_]", title)
+            if match:
+                tieba_name = cls._normalize_text(match.group(1))
+
+        if not tieba_link and tieba_name:
+            tieba_link = f"{const.TIEBA_URL}/f?kw={quote(tieba_name.removesuffix('吧'))}"
+
+        return tieba_name, tieba_link or const.TIEBA_URL
+
+    @classmethod
+    def _clean_title(cls, title: str, tieba_name: str = "") -> str:
+        title = cls._normalize_text(title)
+        title = re.sub(r"_(?:百度贴吧|Baidu Tieba)$", "", title).strip()
+        for name in {tieba_name, tieba_name.removesuffix("吧")}:
+            if name:
+                title = title.replace(f"【{name}】", "").strip()
+        return title
+
     @staticmethod
     def extract_search_note_list(page_content: str) -> List[TiebaNote]:
         """
@@ -49,23 +349,115 @@ class TieBaExtractor:
         Returns:
             List of Tieba post objects
         """
-        xpath_selector = "//div[@class='s_post']"
-        post_list = Selector(text=page_content).xpath(xpath_selector)
+        extractor = TieBaExtractor()
+        selector = Selector(text=page_content)
+        post_list = selector.xpath(
+            f"//div[{extractor._class_contains('s_post')}]"
+        )
         result: List[TiebaNote] = []
         for post in post_list:
-            tieba_note = TiebaNote(note_id=post.xpath(".//span[@class='p_title']/a/@data-tid").get(default='').strip(),
-                                   title=post.xpath(".//span[@class='p_title']/a/text()").get(default='').strip(),
-                                   desc=post.xpath(".//div[@class='p_content']/text()").get(default='').strip(),
-                                   note_url=const.TIEBA_URL + post.xpath(".//span[@class='p_title']/a/@href").get(
-                                       default=''),
-                                   user_nickname=post.xpath(".//a[starts-with(@href, '/home/main')]/font/text()").get(
-                                       default='').strip(), user_link=const.TIEBA_URL + post.xpath(
-                                       ".//a[starts-with(@href, '/home/main')]/@href").get(default=''),
-                                   tieba_name=post.xpath(".//a[@class='p_forum']/font/text()").get(default='').strip(),
-                                   tieba_link=const.TIEBA_URL + post.xpath(".//a[@class='p_forum']/@href").get(
-                                       default=''),
-                                   publish_time=post.xpath(".//font[@class='p_green p_date']/text()").get(
-                                       default='').strip(), )
+            title_link = post.xpath(".//*[contains(@class, 'p_title')]//a[1]")
+            note_url = extractor._absolute_url(title_link.xpath("./@href").get(default=""))
+            note_id = title_link.xpath("./@data-tid").get(default="").strip()
+            if not note_id:
+                note_id = extractor._extract_note_id_from_url(note_url)
+            user_selector = post.xpath(".//a[contains(@href, '/home/main')][1]")
+            forum_selector = post.xpath(f".//a[{extractor._class_contains('p_forum')}][1]")
+            tieba_note = TiebaNote(
+                note_id=note_id,
+                title=extractor._selector_text(post, ".//*[contains(@class, 'p_title')]//a[1]"),
+                desc=extractor._selector_text(
+                    post, f".//div[{extractor._class_contains('p_content')}]"
+                ),
+                note_url=note_url,
+                user_nickname=extractor._selector_text(
+                    post, ".//a[contains(@href, '/home/main')][1]"
+                ),
+                user_link=extractor._absolute_url(user_selector.xpath("./@href").get(default="")),
+                tieba_name=extractor._selector_text(
+                    post, f".//a[{extractor._class_contains('p_forum')}][1]"
+                ),
+                tieba_link=extractor._absolute_url(forum_selector.xpath("./@href").get(default="")),
+                publish_time=extractor._selector_text(
+                    post, ".//*[contains(@class, 'p_date')][1]"
+                ),
+            )
             result.append(tieba_note)
+        if result:
+            return result
+
+        # Tieba search changed to a PC feed/card layout in 2026. The old
+        # s_post nodes disappeared, while each search result now lives in a
+        # threadcardclass card with overlay links to /p/<thread_id>.
+        post_list = selector.xpath(
+            f"//*[contains(concat(' ', normalize-space(@class), ' '), ' threadcardclass ') "
+            f"and .//a[contains(@href, '/p/')]]"
+        )
+        seen_note_ids = set()
+        for post in post_list:
+            title_link = post.xpath(
+                f".//a[{extractor._class_contains('action-link-bg')} and contains(@href, '/p/')][1]"
+                f"|.//a[contains(@href, '/p/')][1]"
+            )
+            note_url = extractor._absolute_url(title_link.xpath("./@href").get(default=""))
+            note_id = extractor._extract_note_id_from_url(note_url)
+            if not note_id or note_id in seen_note_ids:
+                continue
+            seen_note_ids.add(note_id)
+
+            tieba_name = extractor._selector_text(
+                post, f".//*[{extractor._class_contains('forum-name-text')}][1]"
+            )
+            tieba_link = ""
+            forum_link = post.xpath(".//a[contains(@href, '/f?')][1]/@href").get(default="")
+            if forum_link:
+                tieba_link = extractor._absolute_url(forum_link)
+            elif tieba_name:
+                tieba_keyword = tieba_name.removesuffix("吧")
+                tieba_link = f"{const.TIEBA_URL}/f?kw={quote(tieba_keyword)}"
+            else:
+                tieba_link = const.TIEBA_URL
+
+            publish_time = ""
+            top_title_text = extractor._selector_text(
+                post, f".//*[{extractor._class_contains('top-title')}][1]"
+            )
+            publish_match = re.search(r"发布于\s*([^\s]+)", top_title_text)
+            if publish_match:
+                publish_time = publish_match.group(1)
+
+            title = extractor._selector_text(
+                post, f".//*[{extractor._class_contains('title-wrap')}][1]"
+            )
+            desc = extractor._selector_text(
+                post, f".//*[{extractor._class_contains('abstract-wrap')}][1]"
+            )
+            if not title:
+                title = extractor._normalize_text(desc[:80])
+
+            user_nickname = extractor._selector_text(
+                post, f".//*[{extractor._class_contains('forum-attention')}][1]"
+            )
+            if not user_nickname and publish_time:
+                user_nickname = extractor._normalize_text(
+                    top_title_text.split("发布于", 1)[0]
+                )
+
+            comment_text = extractor._selector_text(
+                post, f".//a[{extractor._class_contains('comment-link-zone')}][1]"
+            )
+            tieba_note = TiebaNote(
+                note_id=note_id,
+                title=title,
+                desc=desc,
+                note_url=f"{const.TIEBA_URL}/p/{note_id}",
+                user_nickname=user_nickname,
+                user_link="",
+                tieba_name=tieba_name,
+                tieba_link=tieba_link,
+                publish_time=publish_time,
+                total_replay_num=extractor._text_to_int(comment_text),
+            )
+            result.append(tieba_note)
         return result
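Note: every rewritten selector above goes through the new _class_contains helper, which emits the standard whitespace-padded XPath class test; _class_contains("s_post"), for example, expands to:

    contains(concat(' ', normalize-space(@class), ' '), ' s_post ')

so //div[...] still matches <div class="s_post foo"> with extra classes present, while no longer matching look-alikes such as class="s_post_x".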
@@ -80,27 +472,39 @@ class TieBaExtractor:
         """
         page_content = page_content.replace('<!--', "")
         content_selector = Selector(text=page_content)
-        xpath_selector = "//ul[@id='thread_list']/li"
+        xpath_selector = f"//ul[@id='thread_list']/li[{self._class_contains('j_thread_list')}]"
         post_list = content_selector.xpath(xpath_selector)
+        tieba_name, tieba_link = self._extract_forum_info(content_selector, page_content)
         result: List[TiebaNote] = []
         for post_selector in post_list:
             post_field_value: Dict = self.extract_data_field_value(post_selector)
             if not post_field_value:
                 continue
             note_id = str(post_field_value.get("id"))
-            tieba_note = TiebaNote(note_id=note_id,
-                                   title=post_selector.xpath(".//a[@class='j_th_tit ']/text()").get(default='').strip(),
-                                   desc=post_selector.xpath(
-                                       ".//div[@class='threadlist_abs threadlist_abs_onlyline ']/text()").get(
-                                       default='').strip(), note_url=const.TIEBA_URL + f"/p/{note_id}",
-                                   user_link=const.TIEBA_URL + post_selector.xpath(
-                                       ".//a[@class='frs-author-name j_user_card ']/@href").get(default='').strip(),
-                                   user_nickname=post_field_value.get("authoer_nickname") or post_field_value.get(
-                                       "author_name"),
-                                   tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(
-                                       default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath(
-                                       "//a[@class='card_title_fname']/@href").get(default=''),
-                                   total_replay_num=post_field_value.get("reply_num", 0))
+            user_selector = post_selector.xpath(f".//a[{self._class_contains('frs-author-name')}][1]")
+            title = self._selector_text(post_selector, f".//a[{self._class_contains('j_th_tit')}][1]")
+            if not title:
+                title = self._selector_text(post_selector, f".//*[{self._class_contains('threadlist_title')}]//a[1]")
+            user_nickname = (
+                post_field_value.get("author_nickname")
+                or post_field_value.get("author_name")
+                or self._selector_text(
+                    post_selector, f".//a[{self._class_contains('frs-author-name')}][1]"
+                )
+            )
+            tieba_note = TiebaNote(
+                note_id=note_id,
+                title=title,
+                desc=self._selector_text(
+                    post_selector, f".//div[{self._class_contains('threadlist_abs')}]"
+                ),
+                note_url=const.TIEBA_URL + f"/p/{note_id}",
+                user_link=self._absolute_url(user_selector.xpath("./@href").get(default="")),
+                user_nickname=user_nickname,
+                tieba_name=tieba_name,
+                tieba_link=tieba_link,
+                total_replay_num=post_field_value.get("reply_num", 0),
+            )
             result.append(tieba_note)
         return result
@@ -114,31 +518,59 @@ class TieBaExtractor:
             Tieba post detail object
         """
         content_selector = Selector(text=page_content)
-        first_floor_selector = content_selector.xpath("//div[@class='p_postlist'][1]")
+        first_floor_selector = content_selector.xpath(
+            f"//div[{self._class_contains('l_post')} and {self._class_contains('j_l_post')}][1]"
+        )
         only_view_author_link = content_selector.xpath("//*[@id='lzonly_cntn']/@href").get(default='').strip()
         note_id = only_view_author_link.split("?")[0].split("/")[-1]
+        if not note_id:
+            note_id_match = re.search(r'"thread_id"\s*:\s*"?(\d+)"?', page_content)
+            note_id = note_id_match.group(1) if note_id_match else ""
         # Post reply count and reply page count
         thread_num_infos = content_selector.xpath(
-            "//div[@id='thread_theme_5']//li[@class='l_reply_num']//span[@class='red']")
+            f"//div[@id='thread_theme_5']//li[{self._class_contains('l_reply_num')}]"
+            f"//span[{self._class_contains('red')}]"
+        )
         # IP location and publish time
-        other_info_content = content_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
+        other_info_content = first_floor_selector.xpath(
+            f".//div[{self._class_contains('post-tail-wrap')}]"
+        ).get(default="").strip()
         ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
-        note = TiebaNote(note_id=note_id, title=content_selector.xpath("//title/text()").get(default='').strip(),
-                         desc=content_selector.xpath("//meta[@name='description']/@content").get(default='').strip(),
-                         note_url=const.TIEBA_URL + f"/p/{note_id}",
-                         user_link=const.TIEBA_URL + first_floor_selector.xpath(
-                             ".//a[@class='p_author_face ']/@href").get(default='').strip(),
-                         user_nickname=first_floor_selector.xpath(
-                             ".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(),
-                         user_avatar=first_floor_selector.xpath(".//a[@class='p_author_face ']/img/@src").get(
-                             default='').strip(),
-                         tieba_name=content_selector.xpath("//a[@class='card_title_fname']/text()").get(
-                             default='').strip(), tieba_link=const.TIEBA_URL + content_selector.xpath(
-                             "//a[@class='card_title_fname']/@href").get(default=''), ip_location=ip_location,
-                         publish_time=publish_time,
-                         total_replay_num=thread_num_infos[0].xpath("./text()").get(default='').strip(),
-                         total_replay_page=thread_num_infos[1].xpath("./text()").get(default='').strip(), )
-        note.title = note.title.replace(f"【{note.tieba_name}】_Baidu Tieba", "")
+        tieba_name, tieba_link = self._extract_forum_info(content_selector, page_content)
+        first_floor_value = self.extract_data_field_value(first_floor_selector)
+        author_value = first_floor_value.get("author", {}) if first_floor_value else {}
+        author_link = first_floor_selector.xpath(
+            f".//a[{self._class_contains('p_author_face')} "
+            f"or {self._class_contains('p_author_name')}]/@href"
+        ).get(default="")
+        note = TiebaNote(
+            note_id=note_id,
+            title=content_selector.xpath("//title/text()").get(default="").strip(),
+            desc=content_selector.xpath("//meta[@name='description']/@content").get(default="").strip(),
+            note_url=const.TIEBA_URL + f"/p/{note_id}",
+            user_link=self._absolute_url(author_link),
+            user_nickname=(
+                self._selector_text(first_floor_selector, f".//a[{self._class_contains('p_author_name')}][1]")
+                or author_value.get("user_nickname")
+                or author_value.get("user_name", "")
+            ),
+            user_avatar=first_floor_selector.xpath(
+                f".//a[{self._class_contains('p_author_face')}]//img/@src"
+            ).get(default="").strip(),
+            tieba_name=tieba_name,
+            tieba_link=tieba_link,
+            ip_location=ip_location,
+            publish_time=publish_time,
+            total_replay_num=(
+                thread_num_infos[0].xpath("./text()").get(default="0").strip()
+                if len(thread_num_infos) > 0 else 0
+            ),
+            total_replay_page=(
+                thread_num_infos[1].xpath("./text()").get(default="0").strip()
+                if len(thread_num_infos) > 1 else 0
+            ),
+        )
+        note.title = self._clean_title(note.title, note.tieba_name)
         return note
 
     def extract_tieba_note_parment_comments(self, page_content: str, note_id: str) -> List[TiebaComment]:
@@ -151,30 +583,56 @@ class TieBaExtractor:
|
||||
Returns:
|
||||
List of first-level comment objects
|
||||
"""
|
||||
xpath_selector = "//div[@class='l_post l_post_bright j_l_post clearfix ']"
|
||||
xpath_selector = f"//div[{self._class_contains('l_post')} and {self._class_contains('j_l_post')}]"
|
||||
comment_list = Selector(text=page_content).xpath(xpath_selector)
|
||||
content_selector = Selector(text=page_content)
|
||||
tieba_name, tieba_link = self._extract_forum_info(content_selector, page_content)
|
||||
result: List[TiebaComment] = []
|
||||
for comment_selector in comment_list:
|
||||
comment_field_value: Dict = self.extract_data_field_value(comment_selector)
|
||||
if not comment_field_value:
|
||||
comment_content_value = comment_field_value.get("content", {}) if comment_field_value else {}
|
||||
if not comment_content_value:
|
||||
continue
|
||||
tieba_name = comment_selector.xpath("//a[@class='card_title_fname']/text()").get(default='').strip()
|
||||
other_info_content = comment_selector.xpath(".//div[@class='post-tail-wrap']").get(default="").strip()
|
||||
other_info_content = comment_selector.xpath(
|
||||
f".//div[{self._class_contains('post-tail-wrap')}]"
|
||||
).get(default="").strip()
|
||||
ip_location, publish_time = self.extract_ip_and_pub_time(other_info_content)
|
||||
tieba_comment = TiebaComment(comment_id=str(comment_field_value.get("content").get("post_id")),
|
||||
sub_comment_count=comment_field_value.get("content").get("comment_num"),
|
||||
content=utils.extract_text_from_html(
|
||||
comment_field_value.get("content").get("content")),
|
||||
note_url=const.TIEBA_URL + f"/p/{note_id}",
|
||||
user_link=const.TIEBA_URL + comment_selector.xpath(
|
||||
".//a[@class='p_author_face ']/@href").get(default='').strip(),
|
||||
user_nickname=comment_selector.xpath(
|
||||
".//a[@class='p_author_name j_user_card']/text()").get(default='').strip(),
|
||||
user_avatar=comment_selector.xpath(
|
||||
".//a[@class='p_author_face ']/img/@src").get(default='').strip(),
|
||||
tieba_id=str(comment_field_value.get("content").get("forum_id", "")),
|
||||
tieba_name=tieba_name, tieba_link=f"https://tieba.baidu.com/f?kw={tieba_name}",
|
||||
ip_location=ip_location, publish_time=publish_time, note_id=note_id, )
|
||||
user_selector = comment_selector.xpath(f".//a[{self._class_contains('p_author_name')}][1]")
|
||||
user_avatar = comment_selector.xpath(
|
||||
f".//a[{self._class_contains('p_author_face')}]//img/@src"
|
||||
).get(default="").strip()
|
||||
if not user_avatar and comment_field_value.get("author", {}).get("portrait"):
|
||||
portrait = comment_field_value["author"]["portrait"]
|
||||
user_avatar = (
|
||||
"https://gss0.bdstatic.com/6LZ1dD3d1sgCo2Kml5_Y_D3/sys/portrait/item/"
|
||||
f"{portrait}"
|
||||
)
|
||||
content_html = comment_content_value.get("content") or comment_selector.xpath(
|
||||
f".//div[{self._class_contains('d_post_content')}]"
|
||||
).get(default="")
|
||||
user_nickname = (
|
||||
self._selector_text(comment_selector, f".//a[{self._class_contains('p_author_name')}][1]")
|
||||
or comment_field_value.get("author", {}).get("user_nickname")
|
||||
or comment_field_value.get("author", {}).get("user_name", "")
|
||||
)
|
||||
tieba_comment = TiebaComment(
|
||||
comment_id=str(
|
||||
comment_content_value.get("post_id")
|
||||
or comment_selector.xpath("./@data-pid").get(default="")
|
||||
),
|
||||
sub_comment_count=comment_content_value.get("comment_num") or 0,
|
||||
content=utils.extract_text_from_html(content_html),
|
||||
note_url=const.TIEBA_URL + f"/p/{note_id}",
|
||||
user_link=self._absolute_url(user_selector.xpath("./@href").get(default="")),
|
||||
user_nickname=user_nickname,
|
||||
user_avatar=user_avatar,
|
||||
tieba_id=str(comment_content_value.get("forum_id", "")),
|
||||
tieba_name=tieba_name,
|
||||
tieba_link=tieba_link,
|
||||
ip_location=ip_location,
|
||||
publish_time=publish_time,
|
||||
note_id=note_id,
|
||||
)
|
||||
result.append(tieba_comment)
|
||||
return result
|
||||
|
||||
@@ -190,21 +648,24 @@ class TieBaExtractor:
        """
        selector = Selector(page_content)
        comments = []
-        comment_ele_list = selector.xpath("//li[@class='lzl_single_post j_lzl_s_p first_no_border']")
-        comment_ele_list.extend(selector.xpath("//li[@class='lzl_single_post j_lzl_s_p ']"))
+        comment_ele_list = selector.xpath(
+            f"//li[{self._class_contains('lzl_single_post')} and {self._class_contains('j_lzl_s_p')}]"
+        )
        for comment_ele in comment_ele_list:
            comment_value = self.extract_data_field_value(comment_ele)
            if not comment_value:
                continue
-            comment_user_a_selector = comment_ele.xpath("./a[@class='j_user_card lzl_p_p']")[0]
+            comment_user_a_selector = comment_ele.xpath(
+                f"./a[{self._class_contains('j_user_card')} and {self._class_contains('lzl_p_p')}][1]"
+            )
            content = utils.extract_text_from_html(
-                comment_ele.xpath(".//span[@class='lzl_content_main']").get(default=""))
+                comment_ele.xpath(f".//span[{self._class_contains('lzl_content_main')}]").get(default=""))
            comment = TiebaComment(
                comment_id=str(comment_value.get("spid")), content=content,
-                user_link=comment_user_a_selector.xpath("./@href").get(default=""),
-                user_nickname=comment_value.get("showname"),
+                user_link=self._absolute_url(comment_user_a_selector.xpath("./@href").get(default="")),
+                user_nickname=str(comment_value.get("showname") or ""),
                user_avatar=comment_user_a_selector.xpath("./img/@src").get(default=""),
-                publish_time=comment_ele.xpath(".//span[@class='lzl_time']/text()").get(default="").strip(),
+                publish_time=self._selector_text(comment_ele, f".//span[{self._class_contains('lzl_time')}]"),
                parent_comment_id=parent_comment.comment_id,
                note_id=parent_comment.note_id, note_url=parent_comment.note_url,
                tieba_id=parent_comment.tieba_id, tieba_name=parent_comment.tieba_name,
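
Both hunks above swap exact @class string matches for a _class_contains helper, so markup that gains extra class tokens or trailing spaces keeps matching. As a minimal sketch, the helper is assumed to build the conventional token-matching XPath predicate (the actual implementation in media_platform/tieba/help.py may differ):

# Sketch only, assuming the standard XPath class-token idiom; not the
# verbatim helper from help.py. Matches one token inside a space-separated
# @class value, so class="lzl_single_post j_lzl_s_p " still matches
# 'lzl_single_post'.
def _class_contains(class_name: str) -> str:
    return (
        "contains(concat(' ', normalize-space(@class), ' '), "
        f"' {class_name} ')"
    )
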

tests/test_cmd_arg_tieba.py (new file, 62 lines)
@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-

import config
import pytest
from cmd_arg import parse_cmd
from media_platform.tieba import TieBaCrawler


@pytest.mark.asyncio
async def test_tieba_detail_cli_sets_specified_ids():
    await parse_cmd(
        [
            "--platform",
            "tieba",
            "--type",
            "detail",
            "--specified_id",
            "https://tieba.baidu.com/p/10451142633,9835114923",
        ]
    )

    assert config.TIEBA_SPECIFIED_ID_LIST == ["10451142633", "9835114923"]


@pytest.mark.asyncio
async def test_tieba_creator_cli_sets_creator_urls():
    await parse_cmd(
        [
            "--platform",
            "tieba",
            "--type",
            "creator",
            "--creator_id",
            "tb.1.example,https://tieba.baidu.com/home/main?id=tb.1.raw",
        ]
    )

    assert config.TIEBA_CREATOR_URL_LIST == [
        "https://tieba.baidu.com/home/main?id=tb.1.example",
        "https://tieba.baidu.com/home/main?id=tb.1.raw",
    ]


@pytest.mark.asyncio
async def test_tieba_detail_reads_runtime_specified_ids(monkeypatch):
    crawler = TieBaCrawler()
    seen_note_ids = []

    async def fake_get_note_detail(note_id, semaphore):
        seen_note_ids.append(note_id)
        return None

    async def fake_batch_get_comments(note_details):
        return None

    monkeypatch.setattr(config, "TIEBA_SPECIFIED_ID_LIST", ["10451142633"])
    monkeypatch.setattr(crawler, "get_note_detail_async_task", fake_get_note_detail)
    monkeypatch.setattr(crawler, "batch_get_note_comments", fake_batch_get_comments)

    await crawler.get_specified_notes()

    assert seen_note_ids == ["10451142633"]

tests/test_tieba_client_pagination.py (new file, 110 lines)
@@ -0,0 +1,110 @@
# -*- coding: utf-8 -*-

import pytest

from media_platform.tieba.client import BaiduTieBaClient
from model.m_baidu_tieba import TiebaComment, TiebaNote


class DummyPage:
    url = "https://tieba.baidu.com/"


@pytest.mark.asyncio
async def test_search_uses_requested_page_number():
    client = BaiduTieBaClient(playwright_page=DummyPage())
    calls = []

    async def fake_fetch(uri, method="GET", params=None, data=None, use_sign=False):
        calls.append((uri, params))
        return {"no": 0, "data": {"card_list": []}}

    client._fetch_json_by_browser = fake_fetch

    await client.get_notes_by_keyword("编程兼职", page=2, page_size=10)

    assert calls[0][0] == "/mo/q/search/multsearch"
    assert calls[0][1]["pn"] == 2


@pytest.mark.asyncio
async def test_comments_walk_pages_until_total_reply_page():
    client = BaiduTieBaClient(playwright_page=DummyPage())
    pages = []
    note = TiebaNote(
        note_id="9835114923",
        title="title",
        note_url="https://tieba.baidu.com/p/9835114923",
        tieba_name="加工中心吧",
        tieba_link="https://tieba.baidu.com/f?kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83",
        total_replay_page=2,
    )

    async def fake_get_page_data(note_id, page=1):
        pages.append(page)
        return {"forum": {"id": 1, "name": "加工中心"}, "post_list": []}

    def fake_extract_comments(api_data, note_detail):
        page = pages[-1]
        return [
            TiebaComment(
                comment_id=str(page),
                content="comment",
                note_id=note_detail.note_id,
                note_url=note_detail.note_url,
                tieba_id="1",
                tieba_name=note_detail.tieba_name,
                tieba_link=note_detail.tieba_link,
            )
        ]

    client._get_pc_page_data = fake_get_page_data
    client._page_extractor.extract_tieba_note_parent_comments_from_api = fake_extract_comments

    await client.get_note_all_comments(note, crawl_interval=0, max_count=10)

    assert pages == [1, 2]
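
The assertion above pins the walk to pages [1, 2]. A minimal sketch of the control flow the test assumes, with hypothetical simplifications (the real get_note_all_comments also honors crawl_interval and any comment callback):

# Sketch of the assumed paging loop, not the actual client code: fetch PC
# page data for pages 1..total_replay_page, extract parents, stop at max_count.
async def walk_comment_pages(client, note, max_count):
    collected = []
    for page in range(1, note.total_replay_page + 1):
        api_data = await client._get_pc_page_data(note.note_id, page=page)
        parents = client._page_extractor.extract_tieba_note_parent_comments_from_api(
            api_data, note
        )
        collected.extend(parents[: max_count - len(collected)])
        if len(collected) >= max_count:
            break
    return collected
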

@pytest.mark.asyncio
async def test_creator_feed_walks_until_has_more_false(monkeypatch):
    client = BaiduTieBaClient(playwright_page=DummyPage())
    pages = []

    async def fake_get_notes_by_creator_portrait(portrait, page_number, page_size=20):
        pages.append(page_number)
        return {
            "error_code": 0,
            "data": {
                "has_more": 1 if page_number == 1 else 0,
                "list": [
                    {
                        "thread_info": {
                            "id": str(1000 + page_number),
                            "tid": str(1000 + page_number),
                        }
                    }
                ],
            },
        }

    async def fake_get_note_by_id(note_id):
        return TiebaNote(
            note_id=note_id,
            title="title",
            note_url=f"https://tieba.baidu.com/p/{note_id}",
            tieba_name="加工中心吧",
            tieba_link="https://tieba.baidu.com/f?kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83",
        )

    async def fake_sleep(_):
        return None

    client.get_notes_by_creator_portrait = fake_get_notes_by_creator_portrait
    client.get_note_by_id = fake_get_note_by_id
    monkeypatch.setattr("media_platform.tieba.client.asyncio.sleep", fake_sleep)

    notes = await client.get_all_notes_by_creator_url("tb.1.creator", crawl_interval=0)

    assert pages == [1, 2]
    assert [note.note_id for note in notes] == ["1001", "1002"]
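
A comparable sketch of the feed walk this test pins down: the page number increases until the payload reports has_more falsy (hypothetical shape; the real method also resolves each tid through get_note_by_id and sleeps between pages):

# Sketch of the assumed has_more-driven paging, not the actual client code.
async def walk_creator_feed(client, portrait):
    note_ids, page_number = [], 1
    while True:
        res = await client.get_notes_by_creator_portrait(portrait, page_number)
        data = res.get("data", {})
        note_ids.extend(str(item["thread_info"]["id"]) for item in data.get("list", []))
        if not data.get("has_more"):
            break
        page_number += 1
    return note_ids
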

tests/test_tieba_extractor.py (new file, 278 lines)
@@ -0,0 +1,278 @@
# -*- coding: utf-8 -*-

from pathlib import Path

from media_platform.tieba.help import TieBaExtractor
from model.m_baidu_tieba import TiebaComment


FIXTURE_DIR = Path(__file__).parent.parent / "media_platform" / "tieba" / "test_data"


def read_fixture(name: str) -> str:
    return (FIXTURE_DIR / name).read_text(encoding="utf-8")


def test_extract_search_note_list_from_keyword_page():
    notes = TieBaExtractor.extract_search_note_list(read_fixture("search_keyword_notes.html"))

    assert len(notes) == 10
    assert notes[0].note_id == "9117888152"
    assert notes[0].title.startswith("武汉交互空间科技")
    assert notes[0].tieba_name == "武汉交互空间"
    assert notes[0].user_nickname == "VR虚拟达人"


def test_extract_search_note_list_from_current_pc_card_page():
    page_content = """
    <html>
    <body>
    <div class="threadcardclass thread-new3 index-feed-cards">
      <a class="action-link-bg" href="https://tieba.baidu.com/p/10559655942?fr=undefined"></a>
      <div class="thread-forum-name display-flex align-center">
        <span class="forum-name-text">诸城吧</span>
      </div>
      <div class="top-title">
        <span class="forum-attention user">754023117</span>
        <span>发布于 2026-3-15</span>
      </div>
      <div class="title-wrap"><span>数,英,编程老师</span></div>
      <div class="abstract-wrap">
        <span>培训班需求,数学,英语,编程老师,专职兼职都可</span>
      </div>
      <a class="comment-link-zone" href="https://tieba.baidu.com/p/10559655942?showComment=1">
        <span class="action-number">19</span>
      </a>
    </div>
    </body>
    </html>
    """

    notes = TieBaExtractor.extract_search_note_list(page_content)

    assert len(notes) == 1
    assert notes[0].note_id == "10559655942"
    assert notes[0].title == "数,英,编程老师"
    assert notes[0].desc == "培训班需求,数学,英语,编程老师,专职兼职都可"
    assert notes[0].tieba_name == "诸城吧"
    assert notes[0].tieba_link.endswith("kw=%E8%AF%B8%E5%9F%8E")
    assert notes[0].user_nickname == "754023117"
    assert notes[0].publish_time == "2026-3-15"
    assert notes[0].total_replay_num == 19


def test_extract_search_note_list_from_current_pc_api():
    api_data = {
        "no": 0,
        "error": "success",
        "data": {
            "card_list": [
                {"cardInfo": "related_user", "cardStyle": "related_user", "data": {}},
                {
                    "cardInfo": "thread",
                    "cardStyle": "thread",
                    "data": {
                        "tid": "10559655942",
                        "title": "数,英,编程老师",
                        "content": "培训班需求,数学,英语,编程老师,专职兼职都可",
                        "time": 1773552643,
                        "user": {
                            "show_nickname": "754023117",
                            "portrait": "https://example.com/avatar.jpg",
                        },
                        "post_num": 19,
                        "forum_name": "诸城",
                    },
                },
            ]
        },
    }

    notes = TieBaExtractor().extract_search_note_list_from_api(api_data)

    assert len(notes) == 1
    assert notes[0].note_id == "10559655942"
    assert notes[0].title == "数,英,编程老师"
    assert notes[0].tieba_name == "诸城吧"
    assert notes[0].total_replay_num == 19
    assert notes[0].publish_time


def test_extract_note_detail_and_comments_from_current_pc_api():
    api_data = {
        "error_code": 0,
        "thread": {
            "id": 10451142633,
            "title": "这X尔斯对比巴尔斯,我只能说ID正确,允许居功自傲",
            "reply_num": 15,
            "create_time": 1769951446,
        },
        "forum": {"id": 1627732, "name": "dota2"},
        "page": {"total_page": 1},
        "first_floor": {
            "id": 153154064746,
            "author_id": 4089186644,
            "time": 1769951446,
            "content": [{"type": 0, "text": "皮队败决处刑德国编程钢琴师兼职数学家"}],
        },
        "post_list": [
            {
                "id": 153154097267,
                "author_id": 6614897968,
                "time": 1769952062,
                "content": [{"type": 0, "text": "xg现在大树阵容另一个辅助不选控制"}],
                "sub_post_number": 4,
            }
        ],
        "user_list": [
            {
                "id": 4089186644,
                "name_show": "泰高祖蒙斯克",
                "portrait": "tb.1.f893a7af",
                "ip_address": "广东",
            },
            {
                "id": 6614897968,
                "name_show": "期胡希3",
                "portrait": "tb.1.4d0471d4",
                "ip_address": "河北",
            },
        ],
    }

    extractor = TieBaExtractor()
    note = extractor.extract_note_detail_from_api(api_data)
    comments = extractor.extract_tieba_note_parent_comments_from_api(api_data, note)

    assert note.note_id == "10451142633"
    assert note.title == "这X尔斯对比巴尔斯,我只能说ID正确,允许居功自傲"
    assert note.desc == "皮队败决处刑德国编程钢琴师兼职数学家"
    assert note.user_nickname == "泰高祖蒙斯克"
    assert note.tieba_name == "dota2吧"
    assert note.total_replay_num == 15
    assert note.total_replay_page == 1
    assert note.ip_location == "广东"
    assert len(comments) == 1
    assert comments[0].comment_id == "153154097267"
    assert comments[0].content == "xg现在大树阵容另一个辅助不选控制"
    assert comments[0].user_nickname == "期胡希3"
    assert comments[0].sub_comment_count == 4
    assert comments[0].ip_location == "河北"
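
The fixture keeps post bodies (post_list) and user records (user_list) in separate arrays, so the nickname and ip_location assertions above only hold if the extractor joins them by id. A sketch of that join, using only field names present in the fixture:

# Sketch: resolve a post's author via user_list, keyed by the numeric id
# that appears as author_id on each post; not the extractor's actual code.
users = {str(user["id"]): user for user in api_data["user_list"]}
author = users.get(str(api_data["post_list"][0]["author_id"]), {})
assert author.get("name_show") == "期胡希3"
assert author.get("ip_address") == "河北"
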

def test_extract_creator_info_and_threads_from_current_pc_api():
    creator_api = {
        "error_code": 0,
        "data": {
            "user": {
                "id": 3546493137,
                "name": "拜月教Alice",
                "name_show": "米米世界大手子",
                "portrait": "tb.1.6ad0cd4a.7ZcjVYWa7UpHttCld2OppA?t=1777543466",
                "fans_num": 58,
                "concern_num": 1,
                "sex": 1,
                "tb_age": "7.8",
                "ip_address": "广东",
            }
        },
    }
    feed_api = {
        "error_code": 0,
        "data": {
            "list": [
                {"type": 1, "thread_info": {"id": 10208192951, "tid": 10208192951}},
                {"type": 1, "thread_info": {"id": 9835114923}},
            ]
        },
    }

    extractor = TieBaExtractor()
    creator = extractor.extract_creator_info_from_api(creator_api)
    thread_ids = extractor.extract_creator_thread_id_list_from_api(feed_api)

    assert creator.user_id == "3546493137"
    assert creator.user_name == "拜月教Alice"
    assert creator.nickname == "米米世界大手子"
    assert creator.fans == 58
    assert creator.follows == 1
    assert creator.ip_location == "广东"
    assert creator.registration_duration == "7.8"
    assert thread_ids == ["10208192951", "9835114923"]


def test_extract_tieba_note_list_from_current_frs_api():
    api_data = {
        "error_code": 0,
        "forum": {
            "id": 351091,
            "name": "加工中心",
            "tids": "10376710029,10636556989,",
        },
    }

    notes = TieBaExtractor().extract_tieba_note_list_from_frs_api(api_data)

    assert [note.note_id for note in notes] == ["10376710029", "10636556989"]
    assert notes[0].note_url == "https://tieba.baidu.com/p/10376710029"
    assert notes[0].tieba_name == "加工中心吧"
    assert notes[0].tieba_link.endswith("kw=%E5%8A%A0%E5%B7%A5%E4%B8%AD%E5%BF%83")
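
Note that the frs payload carries thread ids as one comma-separated string with a trailing comma, so whatever split the extractor uses must drop empty fragments. A one-line sketch of that parsing:

# Sketch: "10376710029,10636556989," -> ["10376710029", "10636556989"]
tids = "10376710029,10636556989,"
thread_ids = [tid.strip() for tid in tids.split(",") if tid.strip()]
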

def test_extract_tieba_note_list_from_bigpipe_thread_page():
    notes = TieBaExtractor().extract_tieba_note_list(read_fixture("tieba_note_list.html"))

    assert len(notes) == 48
    assert notes[0].note_id == "9079949995"
    assert notes[0].title == "盗墓笔记全集+txt小说,已整理"
    assert notes[0].user_nickname == "公子伯仲"
    assert notes[0].tieba_name == "盗墓笔记吧"
    assert notes[0].tieba_link.endswith("kw=%E7%9B%97%E5%A2%93%E7%AC%94%E8%AE%B0&ie=utf-8")


def test_extract_note_detail_from_post_page():
    note = TieBaExtractor().extract_note_detail(read_fixture("note_detail.html"))

    assert note.note_id == "9117905169"
    assert note.title == "对于一个父亲来说,这个女儿14岁就死了"
    assert note.user_nickname == "章景轩"
    assert note.tieba_name == "以太比特吧"
    assert note.total_replay_num == 786
    assert note.total_replay_page == 13
    assert note.ip_location == "广东"


def test_extract_parent_comments_from_post_page():
    comments = TieBaExtractor().extract_tieba_note_parment_comments(
        read_fixture("note_comments.html"),
        "9119688421",
    )

    assert len(comments) == 30
    assert comments[0].comment_id == "150726491368"
    assert comments[0].content == "中国队第22金!无悬念!"
    assert comments[0].user_nickname == "heinzfrentzen"
    assert comments[0].tieba_name == "网球风云吧"
    assert comments[0].ip_location == "福建"


def test_extract_sub_comments_with_class_token_matching():
    parent = TiebaComment(
        comment_id="150726496253",
        content="parent",
        note_id="9119688421",
        note_url="https://tieba.baidu.com/p/9119688421",
        tieba_id="4513750",
        tieba_name="网球风云吧",
        tieba_link="https://tieba.baidu.com/f?kw=%E7%BD%91%E7%90%83%E9%A3%8E%E4%BA%91",
    )

    comments = TieBaExtractor().extract_tieba_note_sub_comments(
        read_fixture("note_sub_comments.html"),
        parent,
    )

    assert len(comments) >= 10
    assert comments[0].comment_id
    assert comments[0].parent_comment_id == parent.comment_id
    assert comments[0].user_link.startswith("https://tieba.baidu.com/home/main")