mirror of
https://github.com/NanmiCoder/MediaCrawler.git
synced 2026-03-07 14:40:48 +08:00
feat: xhs support creator url link
This commit is contained in:
@@ -21,8 +21,12 @@ XHS_SPECIFIED_NOTE_URL_LIST = [
|
||||
# ........................
|
||||
]
|
||||
|
||||
# 指定用户ID列表
|
||||
# 指定创作者URL列表 (支持完整URL或纯ID)
|
||||
# 支持格式:
|
||||
# 1. 完整创作者主页URL (带xsec_token和xsec_source参数): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
|
||||
# 2. 纯user_id: "63e36c9a000000002703502b"
|
||||
XHS_CREATOR_ID_LIST = [
|
||||
"63e36c9a000000002703502b",
|
||||
"https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
|
||||
"63e36c9a000000002703502b",
|
||||
# ........................
|
||||
]
|
||||
|
||||
@@ -451,13 +451,26 @@ class XiaoHongShuClient(AbstractApiClient):
|
||||
result.extend(comments)
|
||||
return result
|
||||
|
||||
async def get_creator_info(self, user_id: str) -> Dict:
|
||||
async def get_creator_info(
|
||||
self, user_id: str, xsec_token: str = "", xsec_source: str = ""
|
||||
) -> Dict:
|
||||
"""
|
||||
通过解析网页版的用户主页HTML,获取用户个人简要信息
|
||||
PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可
|
||||
eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
|
||||
|
||||
Args:
|
||||
user_id: 用户ID
|
||||
xsec_token: 验证token (可选,如果URL中包含此参数则传入)
|
||||
xsec_source: 渠道来源 (可选,如果URL中包含此参数则传入)
|
||||
|
||||
Returns:
|
||||
Dict: 创作者信息
|
||||
"""
|
||||
# 构建URI,如果有xsec参数则添加到URL中
|
||||
uri = f"/user/profile/{user_id}"
|
||||
if xsec_token and xsec_source:
|
||||
uri = f"{uri}?xsec_token={xsec_token}&xsec_source={xsec_source}"
|
||||
|
||||
html_content = await self.request(
|
||||
"GET", self._domain + uri, return_response=True, headers=self.headers
|
||||
)
|
||||
|
||||
@@ -26,7 +26,7 @@ from tenacity import RetryError
|
||||
import config
|
||||
from base.base_crawler import AbstractCrawler
|
||||
from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
|
||||
from model.m_xiaohongshu import NoteUrlInfo
|
||||
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
|
||||
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
|
||||
from store import xhs as xhs_store
|
||||
from tools import utils
|
||||
@@ -36,7 +36,7 @@ from var import crawler_type_var, source_keyword_var
|
||||
from .client import XiaoHongShuClient
|
||||
from .exception import DataFetchError
|
||||
from .field import SearchSortType
|
||||
from .help import parse_note_info_from_note_url, get_search_id
|
||||
from .help import parse_note_info_from_note_url, parse_creator_info_from_url, get_search_id
|
||||
from .login import XiaoHongShuLogin
|
||||
|
||||
|
||||
@@ -174,11 +174,24 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
async def get_creators_and_notes(self) -> None:
|
||||
"""Get creator's notes and retrieve their comment information."""
|
||||
utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
|
||||
for user_id in config.XHS_CREATOR_ID_LIST:
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
|
||||
if createor_info:
|
||||
await xhs_store.save_creator(user_id, creator=createor_info)
|
||||
for creator_url in config.XHS_CREATOR_ID_LIST:
|
||||
try:
|
||||
# Parse creator URL to get user_id and security tokens
|
||||
creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
|
||||
utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Parse creator URL info: {creator_info}")
|
||||
user_id = creator_info.user_id
|
||||
|
||||
# get creator detail info from web html content
|
||||
createor_info: Dict = await self.xhs_client.get_creator_info(
|
||||
user_id=user_id,
|
||||
xsec_token=creator_info.xsec_token,
|
||||
xsec_source=creator_info.xsec_source
|
||||
)
|
||||
if createor_info:
|
||||
await xhs_store.save_creator(user_id, creator=createor_info)
|
||||
except ValueError as e:
|
||||
utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] Failed to parse creator URL: {e}")
|
||||
continue
|
||||
|
||||
# Use fixed crawling interval
|
||||
crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
|
||||
@@ -271,7 +284,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
||||
|
||||
try:
|
||||
note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
|
||||
except RetryError as e:
|
||||
except RetryError:
|
||||
pass
|
||||
|
||||
if not note_detail:
|
||||
|
||||
@@ -15,7 +15,7 @@ import random
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
from model.m_xiaohongshu import NoteUrlInfo
|
||||
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
|
||||
from tools.crawler_util import extract_url_params_to_dict
|
||||
|
||||
|
||||
@@ -306,6 +306,37 @@ def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
|
||||
return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)
|
||||
|
||||
|
||||
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Parse creator information out of a XiaoHongShu creator profile URL.

    Two input shapes are accepted:
    1. A full profile URL, e.g.
       "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=...&xsec_source=pc_feed"
    2. A bare user id: exactly 24 lowercase hex characters, e.g.
       "5eb8e1d400000000010075ae"

    Args:
        url: creator profile URL, or a bare user_id
    Returns:
        CreatorUrlInfo: object carrying user_id, xsec_token and xsec_source
    Raises:
        ValueError: when the input matches neither shape
    """
    import re

    # Bare id form: 24 lowercase hex chars — nothing else to extract.
    if re.fullmatch(r"[0-9a-f]{24}", url):
        return CreatorUrlInfo(user_id=url, xsec_token="", xsec_source="")

    # Full URL form: the id is the path segment right after /user/profile/.
    id_match = re.search(r"/user/profile/([^/?]+)", url)
    if id_match is None:
        raise ValueError(f"无法从URL中解析出创作者信息: {url}")

    # Pull the optional security parameters out of the query string.
    query = extract_url_params_to_dict(url)
    return CreatorUrlInfo(
        user_id=id_match.group(1),
        xsec_token=query.get("xsec_token", ""),
        xsec_source=query.get("xsec_source", ""),
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
_img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
|
||||
# 获取一个图片地址在多个cdn下的url地址
|
||||
@@ -313,4 +344,19 @@ if __name__ == '__main__':
|
||||
final_img_url = get_img_url_by_trace_id(get_trace_id(_img_url))
|
||||
print(final_img_url)
|
||||
|
||||
# 测试创作者URL解析
|
||||
print("\n=== 创作者URL解析测试 ===")
|
||||
test_creator_urls = [
|
||||
"https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
|
||||
"5eb8e1d400000000010075ae",
|
||||
]
|
||||
for url in test_creator_urls:
|
||||
try:
|
||||
result = parse_creator_info_from_url(url)
|
||||
print(f"✓ URL: {url[:80]}...")
|
||||
print(f" 结果: {result}\n")
|
||||
except Exception as e:
|
||||
print(f"✗ URL: {url}")
|
||||
print(f" 错误: {e}\n")
|
||||
|
||||
|
||||
|
||||
@@ -18,4 +18,11 @@ from pydantic import BaseModel, Field
|
||||
class NoteUrlInfo(BaseModel):
    """Parsed components of a XiaoHongShu note URL."""
    # Note identifier extracted from the URL path.
    note_id: str = Field(title="note id")
    # Security token carried in the URL query string.
    xsec_token: str = Field(title="xsec token")
    # Channel source carried in the URL query string.
    # (Fix: this field was declared twice; the duplicate — redundant since
    # the later declaration simply overrode the earlier identical one — is removed.)
    xsec_source: str = Field(title="xsec source")
|
||||
|
||||
|
||||
class CreatorUrlInfo(BaseModel):
    """Parsed components of a XiaoHongShu creator profile URL."""
    # Creator (user) identifier taken from the /user/profile/<id> path segment.
    user_id: str = Field(title="user id (creator id)")
    # Optional security token from the URL query string; empty when absent.
    xsec_token: str = Field(default="", title="xsec token")
    # Optional channel source from the URL query string; empty when absent.
    xsec_source: str = Field(default="", title="xsec source")
|
||||
Reference in New Issue
Block a user