From f989ce07880ffa1a4d7bbacdc3202eeb38fa1c08 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Thu, 27 Nov 2025 10:53:08 +0800 Subject: [PATCH] feat: xhs sign playwright version --- media_platform/xhs/client.py | 58 +++----- media_platform/xhs/playwright_sign.py | 203 ++++++++++++++++++++++++++ media_platform/xhs/xhs_sign.py | 152 +++++++++++++++++++ 3 files changed, 373 insertions(+), 40 deletions(-) create mode 100644 media_platform/xhs/playwright_sign.py create mode 100644 media_platform/xhs/xhs_sign.py diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 0a8f074..a68ec6f 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -19,15 +19,12 @@ import asyncio import json -import time from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Union -from urllib.parse import urlencode, urlparse, parse_qs - +from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page from tenacity import retry, stop_after_attempt, wait_fixed -from xhshow import Xhshow import config from base.base_crawler import AbstractApiClient @@ -39,8 +36,9 @@ if TYPE_CHECKING: from .exception import DataFetchError, IPBlockError from .field import SearchNoteType, SearchSortType -from .help import get_search_id, sign +from .help import get_search_id from .extractor import XiaoHongShuExtractor +from .playwright_sign import sign_with_playwright class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): @@ -67,16 +65,14 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): self.playwright_page = playwright_page self.cookie_dict = cookie_dict self._extractor = XiaoHongShuExtractor() - # 初始化 xhshow 客户端用于签名生成 - self._xhshow_client = Xhshow() # 初始化代理池(来自 ProxyRefreshMixin) self.init_proxy_pool(proxy_ip_pool) async def _pre_headers(self, url: str, params: Optional[Dict] = None, payload: Optional[Dict] = None) -> 
Dict: - """请求头参数签名 + """请求头参数签名(使用 playwright 注入方式) Args: - url: 请求的URL(GET请求是包含请求的参数) + url: 请求的URL params: GET请求的参数 payload: POST请求的参数 @@ -84,37 +80,21 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): Dict: 请求头参数签名 """ a1_value = self.cookie_dict.get("a1", "") - parsed = urlparse(url) - uri = parsed.path + + # 确定请求数据和 URI if params is not None: - x_s = self._xhshow_client.sign_xs_get( - uri=uri, a1_value=a1_value, params=params - ) + data = params elif payload is not None: - x_s = self._xhshow_client.sign_xs_post( - uri=uri, a1_value=a1_value, payload=payload - ) + data = payload else: raise ValueError("params or payload is required") - # 获取 b1 值 - b1_value = "" - try: - if self.playwright_page: - local_storage = await self.playwright_page.evaluate( - "() => window.localStorage" - ) - b1_value = local_storage.get("b1", "") - except Exception as e: - utils.logger.warning( - f"[XiaoHongShuClient._pre_headers] Failed to get b1 from localStorage: {e}" - ) - - signs = sign( + # 使用 playwright 注入方式生成签名 + signs = await sign_with_playwright( + page=self.playwright_page, + uri=url, + data=data, a1=a1_value, - b1=b1_value, - x_s=x_s, - x_t=str(int(time.time() * 1000)), ) headers = { @@ -177,11 +157,9 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): """ headers = await self._pre_headers(uri, params) if isinstance(params, dict): - # 使用 xhsshow build_url 构建完整的 URL - full_url = self._xhshow_client.build_url( - base_url=f"{self._host}{uri}", - params=params - ) + # 构建带参数的完整 URL + query_string = urlencode(params) + full_url = f"{self._host}{uri}?{query_string}" else: full_url = f"{self._host}{uri}" @@ -200,7 +178,7 @@ class XiaoHongShuClient(AbstractApiClient, ProxyRefreshMixin): """ headers = await self._pre_headers(uri, payload=data) - json_str = self._xhshow_client.build_json_body(payload=data) + json_str = json.dumps(data, separators=(",", ":"), ensure_ascii=False) return await self.request( method="POST", url=f"{self._host}{uri}", diff --git 
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/xhs/playwright_sign.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。

# Generate Xiaohongshu (XHS) request signatures by calling ``window.mnsv2``
# inside a Playwright-controlled page.

import hashlib
import json
import logging
import time
from typing import Any, Dict, Optional, Union
from urllib.parse import urlparse

from playwright.async_api import Page

from .xhs_sign import b64_encode, encode_utf8, get_trace_id, mrc

logger = logging.getLogger(__name__)

# JavaScript wrapper used to invoke the in-page signer. The two values are
# handed over as a Playwright argument (serialized by Playwright itself), so
# no character inside them can terminate or alter the evaluated script.
_MNSV2_CALL = "([signStr, md5Str]) => window.mnsv2(signStr, md5Str)"


def _build_sign_string(uri: str, data: Optional[Union[Dict, str]] = None) -> str:
    """Build the string that is handed to the in-page signer.

    Args:
        uri: Request path that prefixes the result.
        data: Request data. A dict is appended as compact JSON (no spaces,
            non-ASCII characters kept as-is); a str is appended verbatim;
            any other value, including ``None``, contributes nothing.

    Returns:
        ``uri`` followed by the serialized ``data``.
    """
    if isinstance(data, dict):
        return uri + json.dumps(data, separators=(",", ":"), ensure_ascii=False)
    if isinstance(data, str):
        return uri + data
    return uri


def _md5_hex(s: str) -> str:
    """Return the hexadecimal MD5 digest of ``s`` encoded as UTF-8."""
    return hashlib.md5(s.encode("utf-8")).hexdigest()


def _build_xs_payload(x3_value: str, data_type: str = "object") -> str:
    """Wrap the signer output into the ``x-s`` header value.

    Args:
        x3_value: Value placed in the ``x3`` field (the ``window.mnsv2`` result).
        data_type: Value placed in the ``x4`` field.

    Returns:
        ``"XYS_"`` followed by the custom-Base64 encoding of the compact JSON
        envelope.
    """
    envelope = {
        "x0": "4.2.1",
        "x1": "xhs-pc-web",
        "x2": "Mac OS",
        "x3": x3_value,
        "x4": data_type,
    }
    serialized = json.dumps(envelope, separators=(",", ":"))
    return "XYS_" + b64_encode(encode_utf8(serialized))


def _build_xs_common(a1: str, b1: str, x_s: str, x_t: str) -> str:
    """Build the ``x-s-common`` header value.

    Args:
        a1: Value stored in field ``x5``.
        b1: Value stored in field ``x8``.
        x_s: The ``x-s`` header value, stored in field ``x7``.
        x_t: The ``x-t`` header value, stored in field ``x6``.

    Returns:
        Custom-Base64 encoding of the compact JSON payload. Field ``x9`` is
        ``mrc(x_t + x_s + b1)``.
    """
    payload = {
        "s0": 3,
        "s1": "",
        "x0": "1",
        "x1": "4.2.2",
        "x2": "Mac OS",
        "x3": "xhs-pc-web",
        "x4": "4.74.0",
        "x5": a1,
        "x6": x_t,
        "x7": x_s,
        "x8": b1,
        "x9": mrc(x_t + x_s + b1),
        "x10": 154,
        "x11": "normal",
    }
    serialized = json.dumps(payload, separators=(",", ":"))
    return b64_encode(encode_utf8(serialized))


async def get_b1_from_localstorage(page: Page) -> str:
    """Read the ``b1`` entry from the page's ``localStorage``.

    This is best-effort: any failure while evaluating in the page is logged
    and an empty string is returned instead of raising.

    Args:
        page: Playwright page to evaluate in.

    Returns:
        The ``b1`` value, or ``""`` when it is missing or cannot be read.
    """
    try:
        local_storage = await page.evaluate("() => window.localStorage")
        return local_storage.get("b1", "")
    except Exception as exc:  # deliberate best-effort: callers expect "" on failure
        logger.warning("[get_b1_from_localstorage] failed to read b1: %s", exc)
        return ""


async def call_mnsv2(page: Page, sign_str: str, md5_str: str) -> str:
    """Invoke ``window.mnsv2(sign_str, md5_str)`` in the page.

    The two strings are passed as an evaluation argument rather than being
    spliced into the JavaScript source, so quotes, backslashes, carriage
    returns or Unicode line separators in the request data cannot break or
    change the evaluated script.

    This is best-effort: any failure is logged and ``""`` is returned.

    Args:
        page: Playwright page to evaluate in.
        sign_str: String to sign (see ``_build_sign_string``).
        md5_str: MD5 hex digest of ``sign_str``.

    Returns:
        The value returned by ``window.mnsv2``, or ``""`` when the call fails
        or returns a falsy value.
    """
    try:
        result = await page.evaluate(_MNSV2_CALL, [sign_str, md5_str])
    except Exception as exc:  # deliberate best-effort: callers expect "" on failure
        logger.warning("[call_mnsv2] window.mnsv2 evaluation failed: %s", exc)
        return ""
    return result if result else ""


async def sign_xs_with_playwright(
    page: Page,
    uri: str,
    data: Optional[Union[Dict, str]] = None,
) -> str:
    """Produce the ``x-s`` signature by calling the in-page signer.

    Args:
        page: Playwright page to evaluate in.
            NOTE(review): presumably an XHS page must already be loaded so
            that ``window.mnsv2`` exists - confirm against the caller.
        uri: API path, e.g. ``"/api/sns/web/v1/search/notes"``.
        data: Request data (GET params or POST payload).

    Returns:
        The ``x-s`` header value.
    """
    sign_str = _build_sign_string(uri, data)
    x3_value = await call_mnsv2(page, sign_str, _md5_hex(sign_str))
    # The envelope records whether the signed data was structured or a string.
    data_type = "object" if isinstance(data, (dict, list)) else "string"
    return _build_xs_payload(x3_value, data_type)


async def sign_with_playwright(
    page: Page,
    uri: str,
    data: Optional[Union[Dict, str]] = None,
    a1: str = "",
) -> Dict[str, Any]:
    """Produce the full set of signature headers.

    Args:
        page: Playwright page to evaluate in.
        uri: API path.
        data: Request data.
        a1: The ``a1`` cookie value.

    Returns:
        Dict with the keys ``x-s``, ``x-t``, ``x-s-common`` and
        ``x-b3-traceid``.
    """
    b1 = await get_b1_from_localstorage(page)
    x_s = await sign_xs_with_playwright(page, uri, data)
    # Millisecond timestamp taken after signing, as a decimal string.
    x_t = str(int(time.time() * 1000))

    return {
        "x-s": x_s,
        "x-t": x_t,
        "x-s-common": _build_xs_common(a1, b1, x_s, x_t),
        "x-b3-traceid": get_trace_id(),
    }


async def pre_headers_with_playwright(
    page: Page,
    url: str,
    cookie_dict: Dict[str, str],
    params: Optional[Dict] = None,
    payload: Optional[Dict] = None,
) -> Dict[str, str]:
    """Build signed request headers from a URL plus params or payload.

    Only the path component of ``url`` is signed. ``params`` takes precedence
    over ``payload`` when both are given.

    Args:
        page: Playwright page to evaluate in.
        url: Request URL.
        cookie_dict: Cookie mapping; the ``a1`` entry is read (default ``""``).
        params: GET request parameters.
        payload: POST request payload.

    Returns:
        Dict with the header names ``X-S``, ``X-T``, ``x-S-Common`` and
        ``X-B3-Traceid``.

    Raises:
        ValueError: If both ``params`` and ``payload`` are ``None``.
    """
    if params is not None:
        data = params
    elif payload is not None:
        data = payload
    else:
        raise ValueError("params or payload is required")

    a1_value = cookie_dict.get("a1", "")
    uri = urlparse(url).path
    signs = await sign_with_playwright(page, uri, data, a1_value)

    return {
        "X-S": signs["x-s"],
        "X-T": signs["x-t"],
        "x-S-Common": signs["x-s-common"],
        "X-B3-Traceid": signs["x-b3-traceid"],
    }
# -*- coding: utf-8 -*-
# Copyright (c) 2025 relakkes@gmail.com
#
# This file is part of MediaCrawler project.
# Repository: https://github.com/NanmiCoder/MediaCrawler/blob/main/media_platform/xhs/xhs_sign.py
# GitHub: https://github.com/NanmiCoder
# Licensed under NON-COMMERCIAL LEARNING LICENSE 1.1
#
# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则:
# 1. 不得用于任何商业用途。
# 2. 使用时应遵守目标平台的使用条款和robots.txt规则。
# 3. 不得进行大规模爬取或对平台造成运营干扰。
# 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。
# 5. 不得用于任何非法或不当的用途。
#
# 详细许可条款请参阅项目根目录下的LICENSE文件。
# 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。

# Core helper functions of the Xiaohongshu (XHS) signing algorithm.
# Used by the Playwright-injection signing path.

import ctypes
import random
from urllib.parse import quote  # noqa: F401  (no longer used here; import retained, not removed)

# Custom Base64 alphabet.
# Standard Base64: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/
# XHS uses the same 64 symbols in a shuffled order as obfuscation.
BASE64_CHARS = list("ZmserbBoHQtNP+wOcza/LpngG8yJq42KWYj0DSfdikx3VT16IlUAFM97hECvuRX5")

# CRC32 lookup table (256 entries), indexed by a byte value.
CRC32_TABLE = [
    0, 1996959894, 3993919788, 2567524794, 124634137, 1886057615, 3915621685,
    2657392035, 249268274, 2044508324, 3772115230, 2547177864, 162941995,
    2125561021, 3887607047, 2428444049, 498536548, 1789927666, 4089016648,
    2227061214, 450548861, 1843258603, 4107580753, 2211677639, 325883990,
    1684777152, 4251122042, 2321926636, 335633487, 1661365465, 4195302755,
    2366115317, 997073096, 1281953886, 3579855332, 2724688242, 1006888145,
    1258607687, 3524101629, 2768942443, 901097722, 1119000684, 3686517206,
    2898065728, 853044451, 1172266101, 3705015759, 2882616665, 651767980,
    1373503546, 3369554304, 3218104598, 565507253, 1454621731, 3485111705,
    3099436303, 671266974, 1594198024, 3322730930, 2970347812, 795835527,
    1483230225, 3244367275, 3060149565, 1994146192, 31158534, 2563907772,
    4023717930, 1907459465, 112637215, 2680153253, 3904427059, 2013776290,
    251722036, 2517215374, 3775830040, 2137656763, 141376813, 2439277719,
    3865271297, 1802195444, 476864866, 2238001368, 4066508878, 1812370925,
    453092731, 2181625025, 4111451223, 1706088902, 314042704, 2344532202,
    4240017532, 1658658271, 366619977, 2362670323, 4224994405, 1303535960,
    984961486, 2747007092, 3569037538, 1256170817, 1037604311, 2765210733,
    3554079995, 1131014506, 879679996, 2909243462, 3663771856, 1141124467,
    855842277, 2852801631, 3708648649, 1342533948, 654459306, 3188396048,
    3373015174, 1466479909, 544179635, 3110523913, 3462522015, 1591671054,
    702138776, 2966460450, 3352799412, 1504918807, 783551873, 3082640443,
    3233442989, 3988292384, 2596254646, 62317068, 1957810842, 3939845945,
    2647816111, 81470997, 1943803523, 3814918930, 2489596804, 225274430,
    2053790376, 3826175755, 2466906013, 167816743, 2097651377, 4027552580,
    2265490386, 503444072, 1762050814, 4150417245, 2154129355, 426522225,
    1852507879, 4275313526, 2312317920, 282753626, 1742555852, 4189708143,
    2394877945, 397917763, 1622183637, 3604390888, 2714866558, 953729732,
    1340076626, 3518719985, 2797360999, 1068828381, 1219638859, 3624741850,
    2936675148, 906185462, 1090812512, 3747672003, 2825379669, 829329135,
    1181335161, 3412177804, 3160834842, 628085408, 1382605366, 3423369109,
    3138078467, 570562233, 1426400815, 3317316542, 2998733608, 733239954,
    1555261956, 3268935591, 3050360625, 752459403, 1541320221, 2607071920,
    3965973030, 1969922972, 40735498, 2617837225, 3943577151, 1913087877,
    83908371, 2512341634, 3803740692, 2075208622, 213261112, 2463272603,
    3855990285, 2094854071, 198958881, 2262029012, 4057260610, 1759359992,
    534414190, 2176718541, 4139329115, 1873836001, 414664567, 2282248934,
    4279200368, 1711684554, 285281116, 2405801727, 4167216745, 1634467795,
    376229701, 2685067896, 3608007406, 1308918612, 956543938, 2808555105,
    3495958263, 1231636301, 1047427035, 2932959818, 3654703836, 1088359270,
    936918000, 2847714899, 3736837829, 1202900863, 817233897, 3183342108,
    3401237130, 1404277552, 615818150, 3134207493, 3453421203, 1423857449,
    601450431, 3009837614, 3294710456, 1567103746, 711928724, 3020668471,
    3272380065, 1510334235, 755167117,
]


def _right_shift_unsigned(num: int, bit: int = 0) -> int:
    """Emulate JavaScript's unsigned right shift (``>>>``).

    ``num`` is first reinterpreted as an unsigned 32-bit integer and then
    shifted right by ``bit``. The result is always in ``[0, 2**32)``, so no
    further wrap-around arithmetic is needed.
    """
    return ctypes.c_uint32(num).value >> bit


def mrc(e: str) -> int:
    """Compute the CRC32 variant used for field ``x9`` of ``x-s-common``.

    Only the first 57 characters of ``e`` take part in the checksum. Each
    character's code point is XOR-ed into the table index, so characters are
    expected to have code points below 256.

    Args:
        e: Input string.

    Returns:
        The checksum as a Python int (it may be negative).
    """
    crc = -1
    for ch in e[:57]:
        crc = CRC32_TABLE[(crc & 255) ^ ord(ch)] ^ _right_shift_unsigned(crc, 8)
    return crc ^ -1 ^ 3988292384


def _triplet_to_base64(e: int) -> str:
    """Render a 24-bit integer as four characters of the custom alphabet."""
    return "".join(BASE64_CHARS[(e >> shift) & 63] for shift in (18, 12, 6, 0))


def _encode_chunk(data: list, start: int, end: int) -> str:
    """Encode ``data[start:end]`` three bytes at a time.

    ``end - start`` must be a multiple of three; the tail is handled by
    ``b64_encode``.
    """
    pieces = []
    for offset in range(start, end, 3):
        triplet = (
            ((data[offset] << 16) & 0xFF0000)
            + ((data[offset + 1] << 8) & 0xFF00)
            + (data[offset + 2] & 0xFF)
        )
        pieces.append(_triplet_to_base64(triplet))
    return "".join(pieces)


def encode_utf8(s: str) -> list:
    """Return the UTF-8 encoding of ``s`` as a list of byte values.

    A ``bytes``/``bytearray`` argument is returned as its list of byte values
    unchanged.
    """
    if isinstance(s, (bytes, bytearray)):
        return list(s)
    return list(s.encode("utf-8"))


def b64_encode(data: list) -> str:
    """Base64-encode a list of byte values using the custom alphabet.

    The layout (6-bit groups, ``=`` padding) is that of ordinary Base64; only
    the alphabet differs.

    Args:
        data: Sequence of integers in ``range(256)``.

    Returns:
        The encoded string.
    """
    length = len(data)
    remainder = length % 3
    main_length = length - remainder

    # 16383 is a multiple of three, so every chunk holds whole triplets.
    chunks = [
        _encode_chunk(data, start, min(start + 16383, main_length))
        for start in range(0, main_length, 16383)
    ]

    if remainder == 1:
        tail = data[length - 1]
        chunks.append(BASE64_CHARS[tail >> 2] + BASE64_CHARS[(tail << 4) & 63] + "==")
    elif remainder == 2:
        tail = (data[length - 2] << 8) + data[length - 1]
        chunks.append(
            BASE64_CHARS[tail >> 10]
            + BASE64_CHARS[(tail >> 4) & 63]
            + BASE64_CHARS[(tail << 2) & 63]
            + "="
        )

    return "".join(chunks)


def get_trace_id() -> str:
    """Return a random 16-character lowercase hexadecimal trace id."""
    return "".join(random.choice("abcdef0123456789") for _ in range(16))