From f7d27ab43ad856fd938e1fe60b147c9b5fd51267 Mon Sep 17 00:00:00 2001 From: wanzirong Date: Wed, 21 Jan 2026 16:23:47 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=91=BD=E4=BB=A4?= =?UTF-8?q?=E8=A1=8C=E5=8F=82=E6=95=B0=E6=94=AF=E6=8C=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 添加 --max_comments_per_post 参数用于控制每个帖子爬取的评论数量 - 添加 --xhs_sort_type 参数用于控制小红书排序方式 - 修复小红书 core.py 中 CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES 的导入方式 从直接导入改为通过 config 模块访问,使命令行参数能正确生效 --- cmd_arg/arg.py | 21 +++++++++++++++++++++ media_platform/xhs/core.py | 3 +-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/cmd_arg/arg.py b/cmd_arg/arg.py index 0ae0a21..8e7c360 100644 --- a/cmd_arg/arg.py +++ b/cmd_arg/arg.py @@ -250,6 +250,22 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): rich_help_panel="Basic Configuration", ), ] = "", + max_comments_per_post: Annotated[ + int, + typer.Option( + "--max_comments_per_post", + help="Maximum number of first-level comments to crawl per post/video", + rich_help_panel="Comment Configuration", + ), + ] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, + xhs_sort_type: Annotated[ + str, + typer.Option( + "--xhs_sort_type", + help="XiaoHongShu sort type (e.g., popularity_descending, time_descending)", + rich_help_panel="Platform Specific Configuration", + ), + ] = "", ) -> SimpleNamespace: """MediaCrawler 命令行入口""" @@ -274,6 +290,11 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None): config.CDP_HEADLESS = enable_headless config.SAVE_DATA_OPTION = save_data_option.value config.COOKIES = cookies + config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_per_post + + # Set XiaoHongShu sort type if specified + if xhs_sort_type and platform == PlatformEnum.XHS: + config.SORT_TYPE = xhs_sort_type # Set platform-specific ID lists for detail/creator mode if specified_id_list: diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 108d2c9..7047468 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -34,7 +34,6 @@ from tenacity import RetryError import config from base.base_crawler import AbstractCrawler -from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool from store import xhs as xhs_store @@ -344,7 +343,7 @@ class XiaoHongShuCrawler(AbstractCrawler): xsec_token=xsec_token, crawl_interval=crawl_interval, callback=xhs_store.batch_update_xhs_note_comments, - max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, + max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES, ) # Sleep after fetching comments