feat: 添加命令行参数支持

- 添加 --max_comments_per_post 参数用于控制每个帖子爬取的评论数量
- 添加 --xhs_sort_type 参数用于控制小红书排序方式
- 修复小红书 core.py 中 CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES 的导入方式：
  从直接导入改为通过 config 模块访问，使命令行参数能正确生效
This commit is contained in:
wanzirong
2026-01-21 16:23:47 +08:00
parent be5b786a74
commit f7d27ab43a
2 changed files with 22 additions and 2 deletions

View File

@@ -250,6 +250,22 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
rich_help_panel="Basic Configuration",
),
] = "",
+max_comments_per_post: Annotated[
+int,
+typer.Option(
+"--max_comments_per_post",
+help="Maximum number of first-level comments to crawl per post/video",
+rich_help_panel="Comment Configuration",
+),
+] = config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
+xhs_sort_type: Annotated[
+str,
+typer.Option(
+"--xhs_sort_type",
+help="XiaoHongShu sort type (e.g., popularity_descending, time_descending)",
+rich_help_panel="Platform Specific Configuration",
+),
+] = "",
) -> SimpleNamespace:
"""MediaCrawler 命令行入口"""
@@ -274,6 +290,11 @@ async def parse_cmd(argv: Optional[Sequence[str]] = None):
config.CDP_HEADLESS = enable_headless
config.SAVE_DATA_OPTION = save_data_option.value
config.COOKIES = cookies
+config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES = max_comments_per_post
+# Set XiaoHongShu sort type if specified
+if xhs_sort_type and platform == PlatformEnum.XHS:
+config.SORT_TYPE = xhs_sort_type
# Set platform-specific ID lists for detail/creator mode
if specified_id_list:

View File

@@ -34,7 +34,6 @@ from tenacity import RetryError
import config
from base.base_crawler import AbstractCrawler
-from config import CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES
from model.m_xiaohongshu import NoteUrlInfo, CreatorUrlInfo
from proxy.proxy_ip_pool import IpInfoModel, create_ip_pool
from store import xhs as xhs_store
@@ -344,7 +343,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
xsec_token=xsec_token,
crawl_interval=crawl_interval,
callback=xhs_store.batch_update_xhs_note_comments,
-max_count=CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
+max_count=config.CRAWLER_MAX_COMMENTS_COUNT_SINGLENOTES,
)
# Sleep after fetching comments