mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-05-11 21:17:38 +08:00
fix: xhs creator error
This commit is contained in:
@@ -20,7 +20,7 @@
|
||||
import asyncio
|
||||
import json
|
||||
from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union
|
||||
from urllib.parse import urlencode
|
||||
from urllib.parse import quote, urlencode
|
||||
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
@@ -39,7 +39,7 @@ from .exception import DataFetchError, IPBlockError, NoteNotFoundError
|
||||
from .field import SearchNoteType, SearchSortType
|
||||
from .help import get_search_id
|
||||
from .extractor import XiaoHongShuExtractor
|
||||
from .playwright_sign import sign_with_playwright
|
||||
from .playwright_sign import sign_with_xhshow
|
||||
|
||||
|
||||
class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
@@ -71,19 +71,16 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
self.init_proxy_pool(proxy_ip_pool)
|
||||
|
||||
async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> Dict:
|
||||
"""Request header parameter signing (using playwright injection method)
|
||||
"""请求头参数签名 (使用 xhshow 纯算法)
|
||||
|
||||
Args:
|
||||
url: Request URL
|
||||
params: GET request parameters
|
||||
payload: POST request parameters
|
||||
url: 请求 URI path
|
||||
params: GET 请求参数
|
||||
payload: POST 请求参数
|
||||
|
||||
Returns:
|
||||
Dict: Signed request header parameters
|
||||
Dict: 签名后的请求头参数
|
||||
"""
|
||||
a1_value = self.cookie_dict.get("a1", "")
|
||||
|
||||
# Determine request data, method and URI
|
||||
if params is not None:
|
||||
data = params
|
||||
method = "GET"
|
||||
@@ -93,12 +90,11 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
else:
|
||||
raise ValueError("params or payload is required")
|
||||
|
||||
# Generate signature using playwright injection method
|
||||
signs = await sign_with_playwright(
|
||||
page=self.playwright_page,
|
||||
# 使用 xhshow 纯算法生成签名
|
||||
signs = sign_with_xhshow(
|
||||
uri=url,
|
||||
data=data,
|
||||
a1=a1_value,
|
||||
cookie_str=self.headers.get("Cookie", ""),
|
||||
method=method,
|
||||
)
|
||||
|
||||
@@ -152,6 +148,15 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
err_msg = data.get("msg", None) or f"{response.text}"
|
||||
raise DataFetchError(err_msg)
|
||||
|
||||
@staticmethod
|
||||
def _build_query_string(params: Dict) -> str:
|
||||
"""Build URL query string with encoding matching browser behavior (commas not encoded)"""
|
||||
parts = []
|
||||
for key, value in params.items():
|
||||
value_str = str(value) if value is not None else ""
|
||||
parts.append(f"{key}={quote(value_str, safe=',')}")
|
||||
return "&".join(parts)
|
||||
|
||||
async def get(self, uri: str, params: Optional[Dict] = None) -> Dict:
|
||||
"""
|
||||
GET request, signs request headers
|
||||
@@ -163,10 +168,15 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
|
||||
"""
|
||||
headers = await self._pre_headers(uri, params)
|
||||
full_url = f"{self._host}{uri}"
|
||||
# Build URL manually to ensure query string encoding matches the sign string
|
||||
# (httpx's default params encoding differs from browser/XHS frontend behavior)
|
||||
if params:
|
||||
full_url = f"{self._host}{uri}?{self._build_query_string(params)}"
|
||||
else:
|
||||
full_url = f"{self._host}{uri}"
|
||||
|
||||
return await self.request(
|
||||
method="GET", url=full_url, headers=headers, params=params
|
||||
method="GET", url=full_url, headers=headers
|
||||
)
|
||||
|
||||
async def post(self, uri: str, data: dict, **kwargs) -> Dict:
|
||||
@@ -568,6 +578,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin):
|
||||
"num": page_size,
|
||||
"cursor": cursor,
|
||||
"user_id": creator,
|
||||
"image_formats": "jpg,webp,avif",
|
||||
"xsec_token": xsec_token,
|
||||
"xsec_source": xsec_source,
|
||||
}
|
||||
|
||||
@@ -16,21 +16,61 @@
|
||||
# 详细许可条款请参阅项目根目录下的LICENSE文件。
|
||||
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。
|
||||
|
||||
# Generate Xiaohongshu signature by calling window.mnsv2 via Playwright injection
|
||||
# Xiaohongshu signature generation using xhshow pure-algorithm library
|
||||
#
|
||||
# 致谢:本签名实现依赖 xhshow 开源库, 由 Cloxl 提供
|
||||
# 仓库地址: https://github.com/Cloxl/xhshow
|
||||
# 许可协议: MIT License
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import time
|
||||
from typing import Any, Dict, Optional, Union
|
||||
from urllib.parse import urlparse, quote
|
||||
from urllib.parse import quote
|
||||
|
||||
from playwright.async_api import Page
|
||||
from .xhs_sign import get_trace_id
|
||||
|
||||
from .xhs_sign import b64_encode, encode_utf8, get_trace_id, mrc
|
||||
|
||||
def _patch_xhshow_a3_hash():
    """Monkey-patch xhshow's ``CryptoProcessor.build_payload_array`` a3 hash.

    Background (see https://github.com/Cloxl/xhshow/issues/104): the upstream
    implementation computes the a3 hash for every request as
    ``MD5(extract_api_path(content_string))``, where ``extract_api_path``
    strips both the query string after ``"?"`` and the JSON body after
    ``"{"``. The browser, however, behaves as follows:

    - POST: a3 uses ``MD5(api_path)`` (path with the JSON body removed), so
      the upstream behavior is already correct.
    - GET: a3 uses ``MD5(full URL including query parameters)``, so the
      upstream behavior is wrong because the query string is stripped too.

    The patch therefore recomputes the a3 hash from the full content string
    whenever that string contains no ``"{"`` (treated as a GET request) and
    leaves the upstream result untouched otherwise.

    Calling this function more than once is a no-op after the first call, so
    the wrapper is never stacked on top of itself.

    Raises:
        ImportError: If the ``xhshow`` package is not installed.
    """
    from xhshow.core.crypto import CryptoProcessor

    _original_build = CryptoProcessor.build_payload_array

    # Idempotency guard: do not wrap an already-patched implementation again.
    if getattr(_original_build, "_a3_hash_patched", False):
        return

    def _patched_build(self, hex_parameter, a1_value, app_identifier="xhs-pc-web",
                       string_param="", timestamp=None, sign_state=None):
        payload = _original_build(self, hex_parameter, a1_value, app_identifier,
                                  string_param, timestamp, sign_state)
        # Only fix payloads whose content string has no "{" (i.e. GET requests).
        if "{" not in string_param:
            correct_md5_bytes = list(hashlib.md5(string_param.encode("utf-8")).digest())
            # NOTE(review): the offsets below (seed at index 4, timestamp bytes
            # at 8..15, a3 hash at 128..143) mirror the payload layout this
            # patch was written against -- confirm when upgrading xhshow.
            seed_byte = payload[4]
            ts_bytes = payload[8:16]
            correct_a3_hash = self._custom_hash_v2(list(ts_bytes) + correct_md5_bytes)
            for i in range(16):
                payload[128 + i] = correct_a3_hash[i] ^ seed_byte
        return payload

    _patched_build._a3_hash_patched = True
    CryptoProcessor.build_payload_array = _patched_build
|
||||
|
||||
|
||||
# Apply the monkey-patch once, when this module is imported
|
||||
_patch_xhshow_a3_hash()
|
||||
|
||||
|
||||
def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method: str = "POST") -> str:
|
||||
"""Build string to be signed
|
||||
"""Build content string to be signed
|
||||
|
||||
Args:
|
||||
uri: API path
|
||||
@@ -38,10 +78,9 @@ def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method
|
||||
method: Request method (GET or POST)
|
||||
|
||||
Returns:
|
||||
String to be signed
|
||||
Content string for signing
|
||||
"""
|
||||
if method.upper() == "POST":
|
||||
# POST request uses JSON format
|
||||
c = uri
|
||||
if data is not None:
|
||||
if isinstance(data, dict):
|
||||
@@ -50,10 +89,8 @@ def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method
|
||||
c += data
|
||||
return c
|
||||
else:
|
||||
# GET request uses query string format
|
||||
if not data or (isinstance(data, dict) and len(data) == 0):
|
||||
return uri
|
||||
|
||||
if isinstance(data, dict):
|
||||
params = []
|
||||
for key in data.keys():
|
||||
@@ -64,9 +101,8 @@ def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method
|
||||
value_str = str(value)
|
||||
else:
|
||||
value_str = ""
|
||||
# Use URL encoding (safe parameter preserves certain characters from encoding)
|
||||
# Note: httpx will encode commas, equals signs, etc., we need to handle the same way
|
||||
value_str = quote(value_str, safe='')
|
||||
# URL encoding: preserve commas to match browser behavior
|
||||
value_str = quote(value_str, safe=',')
|
||||
params.append(f"{key}={value_str}")
|
||||
return f"{uri}?{'&'.join(params)}"
|
||||
elif isinstance(data, str):
|
||||
@@ -74,171 +110,67 @@ def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None, method
|
||||
return uri
|
||||
|
||||
|
||||
def _md5_hex(s: str) -> str:
|
||||
"""Calculate MD5 hash value"""
|
||||
return hashlib.md5(s.encode("utf-8")).hexdigest()
|
||||
|
||||
|
||||
def _build_xs_payload(x3_value: str, data_type: str = "object") -> str:
    """Build the ``x-s`` signature string.

    The fields are serialized as compact JSON (no whitespace after separators),
    passed through ``encode_utf8`` and ``b64_encode``, and prefixed with
    ``"XYS_"``.

    Args:
        x3_value: Value stored under the ``x3`` key.
        data_type: Value stored under the ``x4`` key; defaults to ``"object"``.

    Returns:
        The ``"XYS_"``-prefixed encoded signature string.
    """
    fields = {
        "x0": "4.2.1",
        "x1": "xhs-pc-web",
        "x2": "Mac OS",
        "x3": x3_value,
        "x4": data_type,
    }
    compact_json = json.dumps(fields, separators=(",", ":"))
    return "XYS_" + b64_encode(encode_utf8(compact_json))
|
||||
|
||||
|
||||
def _build_xs_common(a1: str, b1: str, x_s: str, x_t: str) -> str:
    """Build the ``x-s-common`` request header value.

    The fields are serialized as compact JSON (no whitespace after separators)
    and passed through ``encode_utf8`` and ``b64_encode``.

    Args:
        a1: Stored under ``x5``.
        b1: Stored under ``x8``.
        x_s: Stored under ``x7``.
        x_t: Stored under ``x6``.

    Returns:
        The encoded header value.
    """
    fields = {
        "s0": 3,
        "s1": "",
        "x0": "1",
        "x1": "4.2.2",
        "x2": "Mac OS",
        "x3": "xhs-pc-web",
        "x4": "4.74.0",
        "x5": a1,
        "x6": x_t,
        "x7": x_s,
        "x8": b1,
        # Checksum-like field computed by ``mrc`` over the concatenation
        # timestamp + signature + b1.
        "x9": mrc(x_t + x_s + b1),
        "x10": 154,
        "x11": "normal",
    }
    compact_json = json.dumps(fields, separators=(",", ":"))
    return b64_encode(encode_utf8(compact_json))
|
||||
|
||||
|
||||
async def get_b1_from_localstorage(page: Page) -> str:
    """Read the ``b1`` entry from the page's ``localStorage``.

    Only the ``b1`` key is requested from the browser instead of serializing
    the whole ``localStorage`` object.

    This is a best-effort lookup: any failure while evaluating the script
    (for example a closed page) yields an empty string instead of raising.

    Args:
        page: Playwright page whose ``localStorage`` is queried.

    Returns:
        The stored ``b1`` value, or ``""`` when it is missing or cannot be read.
    """
    try:
        b1 = await page.evaluate("() => window.localStorage.getItem('b1')")
    except Exception:
        # Deliberate best-effort: callers treat a missing b1 as an empty value.
        return ""
    # getItem returns null (None) when the key does not exist.
    return b1 or ""
|
||||
|
||||
|
||||
async def call_mnsv2(page: Page, sign_str: str, md5_str: str) -> str:
    """Call the page's ``window.mnsv2`` function via Playwright.

    The two strings are handed to the browser as ``page.evaluate`` arguments
    rather than being spliced into JavaScript source text, so no manual
    escaping is needed and arbitrary characters (quotes, backslashes, line
    separators) are passed through unchanged.

    This is a best-effort call: any failure while evaluating yields an empty
    string instead of raising.

    Args:
        page: Playwright page object on which ``window.mnsv2`` is invoked.
        sign_str: String to be signed (uri + JSON.stringify(data)).
        md5_str: MD5 hash value of ``sign_str``.

    Returns:
        The signature string returned by ``mnsv2``, or ``""`` when the call
        fails or returns a falsy value.
    """
    try:
        result = await page.evaluate(
            "([signStr, md5Str]) => window.mnsv2(signStr, md5Str)",
            [sign_str, md5_str],
        )
    except Exception:
        # Deliberate best-effort: a failed signature is reported as "".
        return ""
    return result if result else ""
|
||||
|
||||
|
||||
async def sign_xs_with_playwright(
|
||||
page: Page,
|
||||
def sign_with_xhshow(
|
||||
uri: str,
|
||||
data: Optional[Union[Dict, str]] = None,
|
||||
method: str = "POST",
|
||||
) -> str:
|
||||
"""
|
||||
Generate x-s signature via playwright injection
|
||||
|
||||
Args:
|
||||
page: playwright Page object (must have Xiaohongshu page open)
|
||||
uri: API path, e.g., "/api/sns/web/v1/search/notes"
|
||||
data: Request data (GET params or POST payload)
|
||||
method: Request method (GET or POST)
|
||||
|
||||
Returns:
|
||||
x-s signature string
|
||||
"""
|
||||
sign_str = _build_sign_string(uri, data, method)
|
||||
md5_str = _md5_hex(sign_str)
|
||||
x3_value = await call_mnsv2(page, sign_str, md5_str)
|
||||
data_type = "object" if isinstance(data, (dict, list)) else "string"
|
||||
return _build_xs_payload(x3_value, data_type)
|
||||
|
||||
|
||||
async def sign_with_playwright(
|
||||
page: Page,
|
||||
uri: str,
|
||||
data: Optional[Union[Dict, str]] = None,
|
||||
a1: str = "",
|
||||
cookie_str: str = "",
|
||||
method: str = "POST",
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
Generate complete signature request headers via playwright
|
||||
使用 xhshow 纯算法生成完整签名请求头
|
||||
|
||||
Args:
|
||||
page: playwright Page object (must have Xiaohongshu page open)
|
||||
uri: API path
|
||||
data: Request data
|
||||
a1: a1 value from cookie
|
||||
data: Request data (GET params dict or POST payload dict)
|
||||
cookie_str: Cookie string
|
||||
method: Request method (GET or POST)
|
||||
|
||||
Returns:
|
||||
Dictionary containing x-s, x-t, x-s-common, x-b3-traceid
|
||||
"""
|
||||
b1 = await get_b1_from_localstorage(page)
|
||||
x_s = await sign_xs_with_playwright(page, uri, data, method)
|
||||
x_t = str(int(time.time() * 1000))
|
||||
from xhshow import Xhshow
|
||||
xhshow_client = Xhshow()
|
||||
|
||||
return {
|
||||
"x-s": x_s,
|
||||
"x-t": x_t,
|
||||
"x-s-common": _build_xs_common(a1, b1, x_s, x_t),
|
||||
"x-b3-traceid": get_trace_id(),
|
||||
}
|
||||
is_post = method.upper() == "POST"
|
||||
|
||||
|
||||
async def pre_headers_with_playwright(
|
||||
page: Page,
|
||||
url: str,
|
||||
cookie_dict: Dict[str, str],
|
||||
params: Optional[Dict] = None,
|
||||
payload: Optional[Dict] = None,
|
||||
) -> Dict[str, str]:
|
||||
"""
|
||||
Generate request header signature using playwright injection method
|
||||
Can directly replace _pre_headers method in client.py
|
||||
|
||||
Args:
|
||||
page: playwright Page object
|
||||
url: Request URL
|
||||
cookie_dict: Cookie dictionary
|
||||
params: GET request parameters
|
||||
payload: POST request parameters
|
||||
|
||||
Returns:
|
||||
Signed request header dictionary
|
||||
"""
|
||||
a1_value = cookie_dict.get("a1", "")
|
||||
uri = urlparse(url).path
|
||||
|
||||
# Determine request data and method
|
||||
if params is not None:
|
||||
data = params
|
||||
method = "GET"
|
||||
elif payload is not None:
|
||||
data = payload
|
||||
method = "POST"
|
||||
if is_post:
|
||||
headers = xhshow_client.sign_headers_post(
|
||||
uri=uri,
|
||||
cookies=cookie_str,
|
||||
payload=data if isinstance(data, dict) else {},
|
||||
)
|
||||
else:
|
||||
raise ValueError("params or payload is required")
|
||||
# GET 请求: 构建完整的 content_string 用于签名
|
||||
content_string = _build_sign_string(uri, data, method)
|
||||
cookie_dict = xhshow_client._parse_cookies(cookie_str)
|
||||
a1_value = cookie_dict.get("a1", "")
|
||||
|
||||
signs = await sign_with_playwright(page, uri, data, a1_value, method)
|
||||
ts = time.time()
|
||||
d_value = hashlib.md5(content_string.encode("utf-8")).hexdigest()
|
||||
|
||||
payload_array = xhshow_client.crypto_processor.build_payload_array(
|
||||
d_value, a1_value, "xhs-pc-web", content_string, ts
|
||||
)
|
||||
xor_result = xhshow_client.crypto_processor.bit_ops.xor_transform_array(payload_array)
|
||||
config = xhshow_client.config
|
||||
x3_b64 = xhshow_client.crypto_processor.b64encoder.encode_x3(
|
||||
xor_result[:config.PAYLOAD_LENGTH]
|
||||
)
|
||||
sig_data = config.SIGNATURE_DATA_TEMPLATE.copy()
|
||||
sig_data["x3"] = config.X3_PREFIX + x3_b64
|
||||
x_s = config.XYS_PREFIX + xhshow_client.crypto_processor.b64encoder.encode(
|
||||
json.dumps(sig_data, separators=(",", ":"), ensure_ascii=False)
|
||||
)
|
||||
headers = {
|
||||
"x-s": x_s,
|
||||
"x-s-common": xhshow_client.sign_xs_common(cookie_dict),
|
||||
"x-t": str(xhshow_client.get_x_t(ts)),
|
||||
"x-b3-traceid": xhshow_client.get_b3_trace_id(),
|
||||
}
|
||||
|
||||
return {
|
||||
"X-S": signs["x-s"],
|
||||
"X-T": signs["x-t"],
|
||||
"x-S-Common": signs["x-s-common"],
|
||||
"X-B3-Traceid": signs["x-b3-traceid"],
|
||||
"x-s": headers.get("x-s", ""),
|
||||
"x-t": headers.get("x-t", ""),
|
||||
"x-s-common": headers.get("x-s-common", ""),
|
||||
"x-b3-traceid": headers.get("x-b3-traceid", get_trace_id()),
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user