mirror of https://github.com/NanmiCoder/MediaCrawler.git
feat: xhs support creator url link
@@ -21,8 +21,12 @@ XHS_SPECIFIED_NOTE_URL_LIST = [
     # ........................
 ]

-# Specified list of user IDs
+# Specified list of creator URLs (full URL or bare ID supported)
+# Supported formats:
+# 1. Full creator profile URL (with xsec_token and xsec_source parameters): "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
+# 2. Bare user_id: "63e36c9a000000002703502b"
 XHS_CREATOR_ID_LIST = [
-    "63e36c9a000000002703502b",
+    "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
+    "63e36c9a000000002703502b",
     # ........................
 ]
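Since the list now accepts two formats, a handy sanity check is to run every entry through the new parser before starting a crawl. A minimal sketch, assuming the helper lives at media_platform/xhs/help.py (this diff does not show file paths):

# Hedged sketch: validate each XHS_CREATOR_ID_LIST entry up front.
# The import path below is an assumption; the diff omits file names.
import config
from media_platform.xhs.help import parse_creator_info_from_url

for entry in config.XHS_CREATOR_ID_LIST:
    try:
        info = parse_creator_info_from_url(entry)
        print(f"OK  {info.user_id} token={'set' if info.xsec_token else 'empty'}")
    except ValueError as err:
        print(f"BAD {entry!r}: {err}")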
@@ -451,13 +451,26 @@ class XiaoHongShuClient(AbstractApiClient):
             result.extend(comments)
         return result

-    async def get_creator_info(self, user_id: str) -> Dict:
+    async def get_creator_info(
+        self, user_id: str, xsec_token: str = "", xsec_source: str = ""
+    ) -> Dict:
         """
         Get brief creator info by parsing the HTML of the web-version user profile page
         The PC profile page keeps the data in the window.__INITIAL_STATE__ variable; parsing it is enough
-        eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217
+
+        Args:
+            user_id: user ID
+            xsec_token: verification token (optional; pass it if the URL contains this parameter)
+            xsec_source: channel source (optional; pass it if the URL contains this parameter)
+
+        Returns:
+            Dict: creator info
+        """
+        # Build the URI; append the xsec parameters to the URL if present
         uri = f"/user/profile/{user_id}"
+        if xsec_token and xsec_source:
+            uri = f"{uri}?xsec_token={xsec_token}&xsec_source={xsec_source}"
+
         html_content = await self.request(
             "GET", self._domain + uri, return_response=True, headers=self.headers
         )
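The docstring's claim is that the profile data ships inline in window.__INITIAL_STATE__ on the PC page. The extraction step itself is not shown in this diff; a minimal sketch of the idea (the helper name and the exact script format are assumptions, not this repo's code):

import json
import re
from typing import Dict, Optional

def extract_initial_state(html_content: str) -> Optional[Dict]:
    # Hypothetical helper: grab the inline JS object assigned to
    # window.__INITIAL_STATE__ and coerce it into JSON. Pages that embed
    # the literal `undefined` would break json.loads, hence the replace.
    match = re.search(
        r"window\.__INITIAL_STATE__\s*=\s*(\{.*?\})\s*</script>",
        html_content,
        re.S,
    )
    if not match:
        return None
    return json.loads(match.group(1).replace("undefined", "null"))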
@@ -26,7 +26,7 @@ from tenacity import RetryError
 import config
 from base.base_crawler import AbstractCrawler
 from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
-from model.m_xiaohongshu import NoteUrlInfo
+from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
 from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
 from store import xhs as xhs_store
 from tools import utils
@@ -36,7 +36,7 @@ from var import crawler_type_var, source_keyword_var
 from .client import XiaoHongShuClient
 from .exception import DataFetchError
 from .field import SearchSortType
-from .help import parse_note_info_from_note_url, get_search_id
+from .help import parse_note_info_from_note_url, parse_creator_info_from_url, get_search_id
 from .login import XiaoHongShuLogin

@@ -174,11 +174,24 @@ class XiaoHongShuCrawler(AbstractCrawler):
     async def get_creators_and_notes(self) -> None:
         """Get creator's notes and retrieve their comment information."""
         utils.logger.info("[XiaoHongShuCrawler.get_creators_and_notes] Begin get xiaohongshu creators")
-        for user_id in config.XHS_CREATOR_ID_LIST:
-            # get creator detail info from web html content
-            createor_info: Dict = await self.xhs_client.get_creator_info(user_id=user_id)
-            if createor_info:
-                await xhs_store.save_creator(user_id, creator=createor_info)
+        for creator_url in config.XHS_CREATOR_ID_LIST:
+            try:
+                # Parse creator URL to get user_id and security tokens
+                creator_info: CreatorUrlInfo = parse_creator_info_from_url(creator_url)
+                utils.logger.info(f"[XiaoHongShuCrawler.get_creators_and_notes] Parse creator URL info: {creator_info}")
+                user_id = creator_info.user_id
+
+                # get creator detail info from web html content
+                createor_info: Dict = await self.xhs_client.get_creator_info(
+                    user_id=user_id,
+                    xsec_token=creator_info.xsec_token,
+                    xsec_source=creator_info.xsec_source
+                )
+                if createor_info:
+                    await xhs_store.save_creator(user_id, creator=createor_info)
+            except ValueError as e:
+                utils.logger.error(f"[XiaoHongShuCrawler.get_creators_and_notes] Failed to parse creator URL: {e}")
+                continue

         # Use fixed crawling interval
         crawl_interval = config.CRAWLER_MAX_SLEEP_SEC
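For reference, here is what parse_creator_info_from_url hands back to this loop for each of the two config formats, using the example values that appear in this diff (a sketch, not repo output):

info = parse_creator_info_from_url(
    "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae"
    "?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
)
# -> user_id='5eb8e1d400000000010075ae', xsec_token set, xsec_source='pc_feed'

info = parse_creator_info_from_url("63e36c9a000000002703502b")
# -> user_id='63e36c9a000000002703502b', xsec_token='', xsec_source=''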
@@ -271,7 +284,7 @@ class XiaoHongShuCrawler(AbstractCrawler):

         try:
             note_detail = await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
-        except RetryError as e:
+        except RetryError:
             pass

         if not note_detail:
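This change just drops the unused `as e` binding. For context, RetryError is what tenacity raises once a retry-decorated call exhausts its attempts; a self-contained illustration (not code from this repo):

from tenacity import RetryError, retry, stop_after_attempt

@retry(stop=stop_after_attempt(3))
def flaky_fetch() -> str:
    # Always fails, so tenacity raises RetryError after the third attempt.
    raise ConnectionError("simulated network failure")

try:
    flaky_fetch()
except RetryError as err:
    # last_attempt carries the outcome of the final try.
    print(f"gave up: {err.last_attempt.exception()}")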
@@ -15,7 +15,7 @@ import random
 import time
 import urllib.parse

-from model.m_xiaohongshu import NoteUrlInfo
+from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
 from tools.crawler_util import extract_url_params_to_dict

@@ -306,6 +306,37 @@ def parse_note_info_from_note_url(url: str) -> NoteUrlInfo:
     return NoteUrlInfo(note_id=note_id, xsec_token=xsec_token, xsec_source=xsec_source)


+def parse_creator_info_from_url(url: str) -> CreatorUrlInfo:
+    """
+    Parse creator info from a Xiaohongshu creator profile URL
+    Supported formats:
+    1. Full URL: "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed"
+    2. Bare ID: "5eb8e1d400000000010075ae"
+
+    Args:
+        url: creator profile URL or user_id
+    Returns:
+        CreatorUrlInfo: object carrying user_id, xsec_token and xsec_source
+    """
+    # Bare-ID format (24 hex characters): return it directly
+    if len(url) == 24 and all(c in "0123456789abcdef" for c in url):
+        return CreatorUrlInfo(user_id=url, xsec_token="", xsec_source="")
+
+    # Extract user_id from the URL: /user/profile/xxx
+    import re
+    user_pattern = r'/user/profile/([^/?]+)'
+    match = re.search(user_pattern, url)
+    if match:
+        user_id = match.group(1)
+        # Extract the xsec_token and xsec_source parameters
+        params = extract_url_params_to_dict(url)
+        xsec_token = params.get("xsec_token", "")
+        xsec_source = params.get("xsec_source", "")
+        return CreatorUrlInfo(user_id=user_id, xsec_token=xsec_token, xsec_source=xsec_source)
+
+    raise ValueError(f"Failed to parse creator info from URL: {url}")


 if __name__ == '__main__':
     _img_url = "https://sns-img-bd.xhscdn.com/7a3abfaf-90c1-a828-5de7-022c80b92aa3"
     # Get the URL of one image address under multiple CDNs
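Two details of the parser above are worth spelling out: the character class in the regex stops at both `/` and `?`, and the bare-ID fast path accepts lowercase hex only. A small illustration:

import re

user_pattern = r'/user/profile/([^/?]+)'
# The ID is captured whether or not query parameters follow:
assert re.search(user_pattern, ".../user/profile/abc?x=1").group(1) == "abc"
assert re.search(user_pattern, ".../user/profile/abc").group(1) == "abc"

# A 24-character ID containing uppercase hex fails the fast path and,
# lacking '/user/profile/', would raise ValueError in the parser.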
@@ -313,4 +344,19 @@ if __name__ == '__main__':
     final_img_url = get_img_url_by_trace_id(get_trace_id(_img_url))
     print(final_img_url)
+
+    # Test creator URL parsing
+    print("\n=== Creator URL parsing test ===")
+    test_creator_urls = [
+        "https://www.xiaohongshu.com/user/profile/5eb8e1d400000000010075ae?xsec_token=AB1nWBKCo1vE2HEkfoJUOi5B6BE5n7wVrbdpHoWIj5xHw=&xsec_source=pc_feed",
+        "5eb8e1d400000000010075ae",
+    ]
+    for url in test_creator_urls:
+        try:
+            result = parse_creator_info_from_url(url)
+            print(f"✓ URL: {url[:80]}...")
+            print(f"  Result: {result}\n")
+        except Exception as e:
+            print(f"✗ URL: {url}")
+            print(f"  Error: {e}\n")

@@ -18,4 +18,11 @@ from pydantic import BaseModel, Field
 class NoteUrlInfo(BaseModel):
     note_id: str = Field(title="note id")
     xsec_token: str = Field(title="xsec token")
     xsec_source: str = Field(title="xsec source")
+
+
+class CreatorUrlInfo(BaseModel):
+    """Xiaohongshu creator URL info"""
+    user_id: str = Field(title="user id (creator id)")
+    xsec_token: str = Field(default="", title="xsec token")
+    xsec_source: str = Field(default="", title="xsec source")
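A self-contained usage sketch of the new model: the empty-string defaults are what let bare-ID config entries construct it without tokens, mirroring the parser's fast path.

from pydantic import BaseModel, Field

class CreatorUrlInfo(BaseModel):
    user_id: str = Field(title="user id (creator id)")
    xsec_token: str = Field(default="", title="xsec token")
    xsec_source: str = Field(default="", title="xsec source")

print(CreatorUrlInfo(user_id="5eb8e1d400000000010075ae"))
# user_id='5eb8e1d400000000010075ae' xsec_token='' xsec_source=''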