From a9dd08680f15179f2ddc4d869758d6b813986c00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=A8=8B=E5=BA=8F=E5=91=98=E9=98=BF=E6=B1=9F=28Relakkes?= =?UTF-8?q?=29?= Date: Sat, 18 Oct 2025 07:20:09 +0800 Subject: [PATCH] feat: xhs support creator url link --- config/xhs_config.py | 8 ++++-- media_platform/xhs/client.py | 17 +++++++++++-- media_platform/xhs/core.py | 29 ++++++++++++++++------ media_platform/xhs/help.py | 48 +++++++++++++++++++++++++++++++++++- model/m_xiaohongshu.py | 9 ++++++- 5 files changed, 97 insertions(+), 14 deletions(-) diff --git a/config/xhs_config.py b/config/xhs_config.py index 485277a..9296905 100644 --- a/config/xhs_config.py +++ b/config/xhs_config.py @@ -21,8 +21,12 @@ XHS_SPECIFIED_NOTE_URL_LIST = [ # ........................ ] -# 指定用户ID列表 +# 指定创作者URL列表 (支持完整URL或纯ID) +# 支持格式: +# 1. 完整创作者主页URL (带xsec_token和xsec_source参数): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed" +# 2. 纯user_id: "63e36c9a000000002703502b" XHS_CREATOR_ID_LIST = [ - "63e36c9a000000002703502b", + "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed", + "63e36c9a000000002703502b", # ........................ 
] diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 9df39c1..c538874 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -451,13 +451,26 @@ class XiaoHongShuClient(AbstractApiClient): result.extend(comments) return result - async def get_creator_info(self, user_id: str) -> Dict: + async def get_creator_info( + self, user_id: str, xsec_token: str = "", xsec_source: str = "" + ) -> Dict: """ 通过解析网页版的用户主页HTML,获取用户个人简要信息 PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可 - eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217 + + Args: + user_id: 用户ID + xsec_token: 验证token (可选,如果URL中包含此参数则传入) + xsec_source: 渠道来源 (可选,如果URL中包含此参数则传入) + + Returns: + Dict: 创作者信息 """ + # 构建URI,如果有xsec参数则添加到URL中 uri = f"/user/profile/{user_id}" + if xsec_token and xsec_source: + uri = f"{uri}?xsec_token={xsec_token}&xsec_source={xsec_source}" + html_content = await self.request( "GET", self._domain + uri, return_response=True, headers=self.headers ) diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index f228392..536c1ca 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -26,7 +26,7 @@ from tenacity import RetryError import config from base.base_crawler import AbstractCrawler from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES -from model.m_xiaohongshu import NoteUrlInfo +from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from store import xhs as xhs_store from tools import utils @@ -36,7 +36,7 @@ from var import crawler_type_var, source_keyword_var from .client import XiaoHongShuClient from .exception import DataFetchError from .field import SearchSortType -from .help import parse_note_info_from_note_url, get_search_id +from .help import parse_note_info_from_note_url, parse_creator_info_from_url, get_search_id from .login import XiaoHongShuLogin @@ -174,11 +174,24 @@ class 
XiaoHongShuCrawler(AbstractCrawler): async def get_creators_and_notes(self) -> None: """Get creator's notes and retrieve their comment information.""" utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators") - for user_id in config.XHS_CREATOR_ID_LIST: - # get creator detail info from web html content - createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id) - if createor_info: - await xhs_store.save_creator(user_id, creator=createor_info) + for creator_url in config.XHS_CREATOR_ID_LIST: + try: + # Parse creator URL to get user_id and security tokens + creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url) + utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Parse creator URL info: {creator_info}") + user_id = creator_info.user_id + + # get creator detail info from web html content + createor_info: Dict = await self.xhs_client.get_creator_info( + user_id=user_id, + xsec_token=creator_info.xsec_token, + xsec_source=creator_info.xsec_source + ) + if createor_info: + await xhs_store.save_creator(user_id, creator=createor_info) + except ValueError as e: + utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] Failed to parse creator URL: {e}") + continue # Use fixed crawling interval crawl_interval = config.CRAWLER_MAX_SLEEP_SEC @@ -271,7 +284,7 @@ class XiaoHongShuCrawler(AbstractCrawler): try: note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token) - except RetryError as e: + except RetryError: pass if not note_detail: diff --git a/media_platform/xhs/help.py b/media_platform/xhs/help.py index 3d96811..2838b67 100644 --- a/media_platform/xhs/help.py +++ b/media_platform/xhs/help.py @@ -15,7 +15,7 @@ import random import time import urllib.parse -from model.m_xiaohongshu import NoteUrlInfo +from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo from tools.crawler_util import extract_url_params_to_dict @@ -306,6 +306,37 @@ def 
def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
    """
    Parse a Xiaohongshu creator reference into its user id and xsec parameters.

    Two input forms are accepted:
        1. A full profile URL, e.g.
           "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
        2. A bare user id made of exactly 24 lowercase hexadecimal characters,
           e.g. "5eb8e1d400000000010075ae"

    Args:
        url: Creator profile URL or bare user id. Leading/trailing whitespace
            is ignored.

    Returns:
        CreatorUrlInfo: The extracted user_id together with xsec_token and
        xsec_source; the latter two fall back to "" when the input does not
        carry them.

    Raises:
        ValueError: If ``url`` is not a non-empty string, or if it is neither
            a bare id nor contains a ``/user/profile/<id>`` path segment.
    """
    import re

    # Reject None / non-string / blank entries with the same ValueError the
    # unparsable case uses, so callers that catch ValueError can skip a bad
    # config entry instead of crashing on a TypeError from len().
    if not isinstance(url, str) or not url.strip():
        raise ValueError(f"无法从URL中解析出创作者信息: {url}")

    candidate = url.strip()

    # Bare id: exactly 24 lowercase hex characters, no xsec parameters.
    if re.fullmatch(r"[0-9a-f]{24}", candidate):
        return CreatorUrlInfo(user_id=candidate, xsec_token="", xsec_source="")

    # Profile URL: the id is the path segment right after /user/profile/.
    # '#' is excluded as well so a trailing fragment never leaks into the id.
    match = re.search(r"/user/profile/([^/?#]+)", candidate)
    if match is None:
        raise ValueError(f"无法从URL中解析出创作者信息: {url}")

    # NOTE(review): assumes extract_url_params_to_dict returns a dict-like
    # object of the query parameters — confirm against tools.crawler_util.
    params = extract_url_params_to_dict(candidate)
    return CreatorUrlInfo(
        user_id=match.group(1),
        xsec_token=params.get("xsec_token", ""),
        xsec_source=params.get("xsec_source", ""),
    )
# Identifiers carried by a Xiaohongshu note URL; every field is required.
class NoteUrlInfo(BaseModel):
    note_id: str = Field(..., title="note id")
    xsec_token: str = Field(..., title="xsec token")
    xsec_source: str = Field(..., title="xsec source")


class CreatorUrlInfo(BaseModel):
    """Xiaohongshu creator URL information."""

    # Only user_id is mandatory; the two xsec fields default to an empty
    # string so a bare creator id can be represented without them.
    user_id: str = Field(..., title="user id (creator id)")
    xsec_token: str = Field("", title="xsec token")
    xsec_source: str = Field("", title="xsec source")