From b6caa7a85e56a21d9880fb7b1f21eb15ee540f82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Mon, 10 Nov 2025 21:10:03 +0800 Subject: [PATCH] refactor: add xhs creator params --- config/xhs_config.py | 9 +++------ media_platform/xhs/client.py | 23 +++++++++++++---------- media_platform/xhs/core.py | 2 ++ 3 files changed, 18 insertions(+), 16 deletions(-) diff --git a/config/xhs_config.py b/config/xhs_config.py index 434afd0..63f1a25 100644 --- a/config/xhs_config.py +++ b/config/xhs_config.py @@ -21,12 +21,9 @@ XHS_SPECIFIED_NOTE_URL_LIST = [ # ........................ ] -# 指定创作者URL列表 (支持完整URL或纯ID) -# 支持格式: -# 1. 完整创作者主页URL (带xsec_token和xsec_source参数): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed" -# 2. 纯user_id: "63e36c9a000000002703502b" +# 指定创作者URL列表,需要携带xsec_token和xsec_source参数 + XHS_CREATOR_ID_LIST = [ - "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed", - "63e36c9a000000002703502b", + "https://www.xiaohongshu.com/user/profile/5f58bd990000000001003753?xsec_token=ABYVg1evluJZZzpMX-VWzchxQ1qSNVW3r-jOEnKqMcgZw=&xsec_source=pc_search" # ........................ ] diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 05166ec..3b0db95 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -142,7 +142,8 @@ class XiaoHongShuClient(AbstractApiClient): elif data["code"] == self.IP_ERROR_CODE: raise IPBlockError(self.IP_ERROR_STR) else: - raise DataFetchError(data.get("msg", None)) + err_msg = data.get("msg", None) or f"{response.text}" + raise DataFetchError(err_msg) async def get(self, uri: str, params=None) -> Dict: """ @@ -507,6 +508,8 @@ class XiaoHongShuClient(AbstractApiClient): creator: str, cursor: str, page_size: int = 30, + xsec_token: str = "", + xsec_source: str = "pc_feed", ) -> Dict: """ 获取博主的笔记 @@ -514,24 +517,22 @@ class XiaoHongShuClient(AbstractApiClient): creator: 博主ID cursor: 上一页最后一条笔记的ID page_size: 分页数据长度 + xsec_token: 验证token + xsec_source: 渠道来源 Returns: """ - uri = "/api/sns/web/v1/user_posted" - data = { - "user_id": creator, - "cursor": cursor, - "num": page_size, - "image_formats": "jpg,webp,avif", - } - return await self.get(uri, data) + uri = f"/api/sns/web/v1/user_posted?num={page_size}&cursor={cursor}&user_id={creator}&xsec_token={xsec_token}&xsec_source={xsec_source}" + return await self.get(uri) async def get_all_notes_by_creator( self, user_id: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, + xsec_token: str = "", + xsec_source: str = "pc_feed", ) -> List[Dict]: """ 获取指定用户下的所有发过的帖子,该方法会一直查找一个用户下的所有帖子信息 @@ -539,6 +540,8 @@ class XiaoHongShuClient(AbstractApiClient): user_id: 用户ID crawl_interval: 爬取一次的延迟单位(秒) callback: 一次分页爬取结束后的更新回调函数 + xsec_token: 验证token + xsec_source: 渠道来源 Returns: @@ -547,7 +550,7 @@ class XiaoHongShuClient(AbstractApiClient): notes_has_more = True notes_cursor = "" while notes_has_more and len(result) < config.CRAWLER_MAX_NOTES_COUNT: - notes_res = await self.get_notes_by_creator(user_id, notes_cursor) + notes_res = await self.get_notes_by_creator(user_id, notes_cursor, xsec_token=xsec_token, xsec_source=xsec_source) if not notes_res: utils.logger.error( f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data." diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 73c1c02..bbc8ee7 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -201,6 +201,8 @@ class XiaoHongShuCrawler(AbstractCrawler): user_id=user_id, crawl_interval=crawl_interval, callback=self.fetch_creator_notes_detail, + xsec_token=creator_info.xsec_token, + xsec_source=creator_info.xsec_source, ) note_ids = []